Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,12 @@ off_project/.user.yml
__pycache__/

.coverage

# Scraping
loblaws_mapping_output.csv
loblaws_mapping_output.txt
loblaws_products_output.jsonl
logs_loblaw_scraping.txt
logs_loblaw_mapping.txt
output.json
iga_category_product_urls.txt
44 changes: 38 additions & 6 deletions webscraping/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,9 @@

## Languages

- [English](#english)
- [Français](#français)


## English
📌 TODO


## Français

Expand Down Expand Up @@ -57,8 +53,9 @@ uv run flake8 .

## Épiceries

- [Loblaws](#Lowblaws)
- [Loblaws](#Loblaws)
- [Maxi](#Maxi)
- [IGA](#IGA)


### Loblaws
Expand Down Expand Up @@ -149,4 +146,39 @@ Arguments:
|-------------|-------|------------|------------------|
| `--start` | `int` | `0` | Index de début |
| `--end` | `int` | `1000` | Index de fin |
| `--website` | `str` | `"Loblaws"` | Choix : `"Loblaws"`, `"Maxi"` |
| `--website` | `str` | `"Loblaws"` | Choix : `"Loblaws"`, `"Maxi"` |

### IGA

Contrairement à Loblaws et Maxi, le site d’IGA ne propose pas de sitemap et repose fortement sur du contenu dynamique chargé via JavaScript.

Pour cette raison, une approche différente, basée sur Scrapy + Playwright, a été utilisée.

Le scraping IGA repose sur le script `iga_products.py` et est divisé en deux étapes :

- `Exploration d’une catégorie`:
- Parcours des pages de catégorie
- Gestion de la pagination (?page=n)
- Extraction des URLs produits
- `Scraping des produits`:
- Visite de chaque page produit
- Extraction des données via JSON-LD
Retourne : Un JSON structuré de produits alimentaires IGA, avec leurs données nutritionnelles et descriptives.

#### Exécution:

`iga_products.py`:
```bash
cd webscraping/iga_scraper
uv run scrapy runspider iga_products.py -O output.json
```

#### Limitations:
Le site IGA utilise des mécanismes de protection anti-bot.

Après un certain volume de requêtes, le serveur peut retourner : `403 Forbidden`
Ce comportement :
- n’est pas lié au code
- provient d’un blocage côté serveur
- dépend du volume de requêtes et de l’IP
<br>
278 changes: 278 additions & 0 deletions webscraping/iga_scraper/iga_products.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
import scrapy
import json
from urllib.parse import urljoin
from scrapy_playwright.page import PageMethod


class IgaCategoryProductsSpider(scrapy.Spider):
    """Scrapy spider configured to crawl one IGA product category through Playwright."""

    name = "iga_category_products"
    allowed_domains = ["iga.ca"]

    # Category targeted by this spider: a label attached to scraped items and
    # the listing URL that pagination starts from.
    category_name = "Produits_surgeles"
    base_category_url = "https://www.iga.ca/fr/produits/cat%C3%A9gorie/Produits__surgel%C3%A9s"

    custom_settings = {
        # Route both URL schemes through the scrapy-playwright download handler.
        "DOWNLOAD_HANDLERS": {
            scheme: "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler"
            for scheme in ("http", "https")
        },
        # scrapy-playwright requires the asyncio-based Twisted reactor.
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "ROBOTSTXT_OBEY": False,
        "PLAYWRIGHT_BROWSER_TYPE": "chromium",
        "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
        # Present the crawler as a French-Canadian desktop Chrome browser.
        "DEFAULT_REQUEST_HEADERS": {
            "Accept-Language": "fr-CA,fr;q=0.9,en;q=0.8",
            "User-Agent": " ".join(
                [
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
                    "AppleWebKit/537.36 (KHTML, like Gecko)",
                    "Chrome/122.0.0.0 Safari/537.36",
                ]
            ),
        },
        "CONCURRENT_REQUESTS": 8,
    }

def start_requests(self):
    """Start the crawl on page 1 of the configured category.

    Yields:
        scrapy.Request: Playwright-rendered request for ``base_category_url``
        carrying an empty accumulator of product URLs.
    """
    yield self._build_category_request(self.base_category_url, 1, [])

def _build_category_request(self, url, page_number, all_product_urls):
    """Build the Playwright request used for every category listing page.

    Args:
        url: Listing URL to load.
        page_number: 1-based index of the listing page, forwarded in ``meta``.
        all_product_urls: Product URLs collected so far, forwarded in
            ``meta`` so the next callback can keep accumulating.

    Returns:
        scrapy.Request: Request handled by ``parse_category_page`` on success
        and by ``close_page_on_error`` on failure.
    """
    return scrapy.Request(
        url=url,
        meta={
            "playwright": True,
            # The callback needs the live page to scroll and read the DOM.
            "playwright_include_page": True,
            "playwright_page_methods": [
                PageMethod("wait_for_load_state", "domcontentloaded"),
                PageMethod("wait_for_timeout", 2500),
            ],
            "page_number": page_number,
            "all_product_urls": all_product_urls,
        },
        callback=self.parse_category_page,
        # With playwright_include_page the page is ours to close; without an
        # errback a failed request would leave it open.
        errback=self.close_page_on_error,
    )

async def close_page_on_error(self, failure):
    """Log a failed category request and close its Playwright page, if any.

    Args:
        failure: Failure object passed by Scrapy to the errback; its
            ``request`` attribute is the request that failed.
    """
    request = failure.request
    self.logger.error(
        "Category page %s failed (%s): %r",
        request.meta.get("page_number"),
        request.url,
        failure.value,
    )
    # The page is absent when the failure happened before it was created.
    page = request.meta.get("playwright_page")
    if page is not None:
        await page.close()

async def parse_category_page(self, response):
    """Collect product URLs from one listing page and continue pagination.

    The product URLs found on the current page are merged with those carried
    in ``response.meta``. When a page past the first adds nothing new,
    pagination stops: the full URL list is written to
    ``iga_category_product_urls.txt`` and one request per product is yielded.
    Otherwise the next ``?page=n`` listing is requested.

    Args:
        response: Response of a request built with ``playwright_include_page``;
            ``meta`` must hold ``playwright_page`` and ``page_number``.

    Yields:
        scrapy.Request: Either the next listing page, or the product pages
        handled by ``parse_product``.
    """
    page = response.meta["playwright_page"]
    page_number = response.meta["page_number"]
    all_product_urls = set(response.meta.get("all_product_urls", []))

    try:
        self.logger.info("Category page %s URL: %s", page_number, response.url)

        await self.wait_for_products_to_settle(page)

        page_urls = await self.extract_product_urls_from_current_page(page)

        self.logger.info(
            "Page %s: %s URLs produit trouvées",
            page_number,
            len(page_urls),
        )

        before_count = len(all_product_urls)
        all_product_urls.update(page_urls)
        after_count = len(all_product_urls)

        self.logger.info(
            "Page %s: cumul avant=%s, après=%s, ajoutées=%s",
            page_number,
            before_count,
            after_count,
            after_count - before_count,
        )
    finally:
        # Always release the browser page, even if extraction raised.
        await page.close()

    if after_count == before_count and page_number > 1:
        self.logger.info("Aucune nouvelle URL à la page %s. Fin de pagination.", page_number)

        ordered_urls = sorted(all_product_urls)

        with open("iga_category_product_urls.txt", "w", encoding="utf-8") as f:
            f.writelines(url + "\n" for url in ordered_urls)

        for product_url in ordered_urls:
            yield scrapy.Request(
                url=product_url,
                meta={
                    "category_name": self.category_name,
                    "source_category_url": self.base_category_url,
                },
                callback=self.parse_product,
            )
        return

    next_page_number = page_number + 1
    yield self._build_category_request(
        f"{self.base_category_url}?page={next_page_number}",
        next_page_number,
        # Sorted list keeps the meta payload deterministic.
        sorted(all_product_urls),
    )

async def wait_for_products_to_settle(self, page):
    """Scroll the listing until the number of product links stops changing.

    Each round scrolls to the bottom, waits 700 ms, and counts the distinct
    ``/fr/produits/`` links (query string ignored). The wait ends after two
    consecutive rounds with an unchanged count, or after 15 rounds.

    Args:
        page: Playwright page showing a category listing.
    """
    collect_links_js = """
        els => els
            .map(a => a.getAttribute('href'))
            .filter(h => h && h.includes('/fr/produits/'))
        """

    last_seen = -1
    unchanged_streak = 0
    rounds_left = 15

    while rounds_left > 0:
        rounds_left -= 1

        # Scrolling to the bottom is what triggers further content to load.
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(700)

        raw_links = await page.eval_on_selector_all("a[href]", collect_links_js)

        distinct_links = set()
        for link in raw_links:
            if link and "/fr/produits/" in link:
                distinct_links.add(urljoin("https://www.iga.ca", link.split("?")[0]))
        seen_now = len(distinct_links)

        unchanged_streak = unchanged_streak + 1 if seen_now == last_seen else 0
        last_seen = seen_now

        if unchanged_streak >= 2:
            return

async def extract_product_urls_from_current_page(self, page):
    """Return the absolute product URLs linked from the current listing page.

    Anchors whose ``href`` contains ``/fr/produits/`` are read from the DOM,
    filtered with ``is_product_url``, stripped of their query string, and
    resolved against ``https://www.iga.ca``.

    Args:
        page: Playwright page showing a category listing.

    Returns:
        set[str]: De-duplicated absolute product URLs.
    """
    candidate_hrefs = await page.eval_on_selector_all(
        "a[href]",
        """
        els => els
            .map(a => a.getAttribute('href'))
            .filter(h => h && h.includes('/fr/produits/'))
        """,
    )

    return {
        urljoin("https://www.iga.ca", candidate.split("?")[0])
        for candidate in candidate_hrefs
        if self.is_product_url(candidate)
    }

def parse_product(self, response):
    """Turn a product page into one flat item built from its JSON-LD data.

    When no JSON-LD ``Product`` is found, an item holding only the URL, the
    category context and an ``error`` message is yielded instead.

    Args:
        response: Product page response; ``meta`` may carry ``category_name``
            and ``source_category_url``.

    Yields:
        dict: Product fields (identity, offer, image, nutrition,
        sub-category), or the error item described above.
    """

    def as_mapping(value):
        # A JSON-LD property may be an object, a list of objects, or null;
        # normalise to a dict so the ``.get`` lookups below never fail.
        if isinstance(value, list):
            value = next((entry for entry in value if isinstance(entry, dict)), None)
        return value if isinstance(value, dict) else {}

    category_name = response.meta.get("category_name")
    source_category_url = response.meta.get("source_category_url")

    product_data = self.extract_jsonld_product(response)

    if not product_data:
        yield {
            "url": response.url,
            "category_name": category_name,
            "source_category_url": source_category_url,
            "error": "Aucun JSON-LD Product trouvé",
        }
        return

    offers = as_mapping(product_data.get("offers"))
    nutrition = as_mapping(product_data.get("nutrition"))
    breadcrumb = as_mapping(product_data.get("breadcrumb"))
    brand = product_data.get("brand", {})

    # The second-to-last breadcrumb entry is used as the sub-category, and
    # only when the trail has at least three entries.
    sub_category = ""
    items = breadcrumb.get("itemListElement") or []
    if isinstance(items, list) and len(items) >= 3 and isinstance(items[-2], dict):
        sub_category = items[-2].get("name", "")

    # Keep the first image of a list; an absent or empty list means no image.
    images = product_data.get("image")
    if isinstance(images, list):
        image_url = images[0] if images else None
    else:
        image_url = images

    # Reduce the schema.org URI to its bare term, whichever scheme was used.
    availability = offers.get("availability", "")
    if isinstance(availability, str):
        availability = availability.replace("https://schema.org/", "")
        availability = availability.replace("http://schema.org/", "")

    yield {
        "category_name": category_name,
        "source_category_url": source_category_url,
        "url": response.url,
        "name": product_data.get("name"),
        "sku": product_data.get("sku"),
        "brand": brand.get("name") if isinstance(brand, dict) else brand,
        "description": product_data.get("description"),
        "price": offers.get("price"),
        "price_currency": offers.get("priceCurrency"),
        "availability": availability,
        "product_url": offers.get("url"),
        "image_url": image_url,
        "serving_size": nutrition.get("servingSize"),
        "calories": nutrition.get("calories"),
        "fat": nutrition.get("fatContent"),
        "saturated_fat": nutrition.get("saturatedFatContent"),
        "sodium": nutrition.get("sodiumContent"),
        "carbohydrate": nutrition.get("carbohydrateContent"),
        "fiber": nutrition.get("fiberContent"),
        "sugar": nutrition.get("sugarContent"),
        "protein": nutrition.get("proteinContent"),
        "calcium": nutrition.get("calciumContent"),
        "iron": nutrition.get("ironContent"),
        "potassium": nutrition.get("potassiumContent"),
        "sub_category": sub_category,
    }

def extract_jsonld_product(self, response):
    """Return the first JSON-LD ``Product`` object embedded in the page.

    Every ``<script type="application/ld+json">`` block is parsed in document
    order. A block may be a single object, a list of objects, or an object
    wrapping its nodes in ``@graph``; ``@type`` may be a string or a list.

    Args:
        response: Response exposing ``css(...).getall()`` and ``url``.

    Returns:
        dict | None: The first node typed ``Product``, or ``None`` when the
        page has none.
    """

    def is_product(node):
        if not isinstance(node, dict):
            return False
        node_type = node.get("@type")
        if isinstance(node_type, list):
            return "Product" in node_type
        return node_type == "Product"

    scripts = response.css('script[type="application/ld+json"]::text').getall()

    for script in scripts:
        try:
            data = json.loads(script)
        except json.JSONDecodeError:
            # Best effort: one malformed block must not hide a valid one.
            self.logger.debug("Skipping malformed JSON-LD block on %s", response.url)
            continue

        candidates = data if isinstance(data, list) else [data]

        for candidate in candidates:
            if is_product(candidate):
                return candidate

            # Some pages nest their nodes under a top-level "@graph" array.
            graph = candidate.get("@graph") if isinstance(candidate, dict) else None
            if isinstance(graph, list):
                for node in graph:
                    if is_product(node):
                        return node

    return None

def is_product_url(self, href: str) -> bool:
    """Tell whether ``href`` looks like a link to a single product page.

    A link qualifies when it contains ``/fr/produits/`` and none of the
    excluded markers (category listings, cookie/consent widgets, fragment
    links). Matching is case-insensitive.

    Args:
        href: Raw ``href`` attribute value; empty or ``None`` is rejected.

    Returns:
        bool: ``True`` when the link is kept as a product URL.
    """
    lowered = href.lower() if href else ""

    if "/fr/produits/" not in lowered:
        return False

    excluded_tokens = (
        "categorie",
        "cat%C3%A9gorie",
        "catégorie",
        "category",
        "cookie",
        "consent",
        "onetrust",
        "#",
    )

    # Tokens are lower-cased too so the percent-encoded one still matches.
    return not any(token.lower() in lowered for token in excluded_tokens)
4 changes: 4 additions & 0 deletions webscraping/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@ requires-python = "==3.14.*"
dependencies = [
"beautifulsoup4>=4.14.3",
"lxml>=6.0.2",
"pandas>=3.0.1",
"playwright>=1.58.0",
"pytest-cov>=7.1.0",
"requests>=2.32.5",
"scrapy-playwright>=0.0.46",
"selenium>=4.41.0",
"tqdm>=4.67.3",
]
Expand Down
Loading
Loading