#!/usr/bin/env python3
"""
WooCommerce E-shop Scraper
Scrapes product data from a WooCommerce-based demo shop.

Usage:
    python scraper_eshop.py
"""

import requests
from bs4 import BeautifulSoup
import json
import time
import logging
from typing import Dict, List, Optional
from urllib.parse import urljoin
import re

# Configuration
BASE_URL = "http://demo-eshop.local"  # change this if needed
SHOP_URL = f"{BASE_URL}/shop/"
OUTPUT_FILE = "eshop_scrape.json"
REQUEST_DELAY = 1      # seconds between product requests
MAX_RETRIES = 3
TIMEOUT = 10

# Logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def fetch_page(url: str, retries: int = MAX_RETRIES) -> Optional[str]:
    """
    Φέρνει μια σελίδα με retries και απλό exponential backoff.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        )
    }

    for attempt in range(retries):
        try:
            logger.info(f"Fetching: {url} (attempt {attempt + 1}/{retries})")
            resp = requests.get(url, headers=headers, timeout=TIMEOUT)
            resp.raise_for_status()
            return resp.text
        except requests.exceptions.RequestException as e:
            logger.warning(f"Request failed: {e}")
            if attempt < retries - 1:
                time.sleep(2 ** attempt)
            else:
                logger.error(f"Giving up on {url}")
    return None
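

# An alternative to hand-rolled retries (not used by this script): let
# requests/urllib3 handle retrying and backoff through a shared Session.
# A minimal sketch, assuming the standard Retry/HTTPAdapter API:
def make_retrying_session(retries: int = MAX_RETRIES) -> requests.Session:
    """Build a requests.Session that retries transient HTTP errors."""
    # Imports are local so the sketch stays self-contained.
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    retry = Retry(
        total=retries,
        backoff_factor=1,  # exponential backoff between attempts
        status_forcelist=(429, 500, 502, 503, 504),
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session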


def parse_product_list(html: str) -> List[str]:
    """
    Από τη σελίδα /shop/ (ή /shop/page/N) επιστρέφει URLs προϊόντων.
    """
    soup = BeautifulSoup(html, "lxml")
    product_urls: List[str] = []

    product_links = soup.select("ul.products li.product a.woocommerce-LoopProduct-link")
    for a in product_links:
        href = a.get("href")
        if href:
            # urljoin is a no-op for absolute hrefs but also tolerates relative ones
            product_urls.append(urljoin(BASE_URL, href))

    logger.info(f"Found {len(product_urls)} products on listing page")
    return product_urls


def get_next_page_url(html: str, current_url: str) -> Optional[str]:
    """
    Βρίσκει το URL της επόμενης σελίδας (pagination).
    """
    soup = BeautifulSoup(html, "lxml")
    next_link = soup.select_one("nav.woocommerce-pagination a.next")
    if next_link and next_link.get("href"):
        return next_link["href"]
    return None


def extract_rating_from_style(style_str: str) -> Optional[float]:
    """
    Από width style (π.χ. "width:80%") επιστρέφει rating 0–5.
    """
    if not style_str:
        return None
    match = re.search(r"width:\s*([\d.]+)%", style_str)
    if match:
        try:
            percentage = float(match.group(1))
            return round((percentage / 100.0) * 5.0, 2)
        except ValueError:
            return None
    return None


def extract_rating_from_container(container) -> Optional[float]:
    """
    Παίρνει ένα .star-rating element και προσπαθεί να βγάλει αριθμητικό rating,
    είτε από span[style] (width:XX%), είτε από κείμενο τύπου 'Rated 4.5 out of 5'.
    """
    if not container:
        return None

    # Prefer the span with an inline style (typical WooCommerce markup)
    span = container.select_one("span")
    style_str = ""
    if span and span.get("style"):
        style_str = span.get("style", "")
    elif container.get("style"):
        style_str = container.get("style", "")

    rating = extract_rating_from_style(style_str) if style_str else None
    if rating is not None:
        return rating

    # Fallback: parse text like "Rated X out of 5"
    text = container.get_text(" ", strip=True)
    match = re.search(r"Rated\s+([\d.]+)\s+out of", text)
    if match:
        try:
            return float(match.group(1))
        except ValueError:
            return None

    return None
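

# Added helper (a minimal sketch): centralises price-string cleanup so the
# regular and sale price branches in parse_product_page share the same logic.
# The euro-style formats it assumes ("1.299,00 €", "€19.99") are an assumption
# about the demo shop's locale, not a WooCommerce guarantee.
def parse_price_text(text: str) -> Optional[float]:
    """
    Normalise a WooCommerce price string into a float.

    When both "," and "." appear, the last one is treated as the decimal
    separator. Ambiguous strings such as "1.299" are read as 1.299.
    Returns None if no number can be extracted.
    """
    if not text:
        return None
    # Keep only digits and separators; drop currency symbols and whitespace.
    cleaned = re.sub(r"[^\d,.]", "", text)
    if not cleaned:
        return None
    if "," in cleaned and "." in cleaned:
        # Both separators present: the later one is the decimal separator.
        if cleaned.rfind(",") > cleaned.rfind("."):
            cleaned = cleaned.replace(".", "").replace(",", ".")
        else:
            cleaned = cleaned.replace(",", "")
    else:
        # Single separator type: treat a comma as the decimal separator.
        cleaned = cleaned.replace(",", ".")
    try:
        return float(cleaned)
    except ValueError:
        return None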


def parse_product_page(html: str, url: str) -> Dict:
    """
    Κάνει parse μία σελίδα προϊόντος και επιστρέφει πλήρες dict με δεδομένα.
    """
    soup = BeautifulSoup(html, "lxml")
    product_data: Dict = {"url": url}

    try:
        # Product ID from the div id="product-XX" wrapper
        product_div = soup.select_one('div[id^="product-"]')
        if product_div:
            pid = product_div.get("id", "").replace("product-", "")
            product_data["product_id"] = pid

        # Product title
        title_el = soup.select_one("h1.product_title")
        product_data["name"] = title_el.get_text(strip=True) if title_el else None

        # SKU
        sku_el = soup.select_one(".sku")
        product_data["sku"] = sku_el.get_text(strip=True) if sku_el else None

        # Categories (product_meta → posted_in)
        categories: List[str] = []
        cat_links = soup.select(".product_meta .posted_in a")
        for c in cat_links:
            categories.append(c.get_text(strip=True))
        product_data["categories"] = categories

        # Brand (if exposed as a taxonomy or custom meta)
        # typical case: a custom "brand" taxonomy rendered inside product_meta
        brand_el = soup.select_one(".product_meta .posted_in .brand a")
        if not brand_el and len(cat_links) > 1:
            # heuristic fallback: the last link in posted_in is sometimes a
            # brand term (it may simply be another category)
            brand_el = cat_links[-1]
        product_data["brand"] = brand_el.get_text(strip=True) if brand_el else None

        # Prices
        price_elem = soup.select_one("p.price")
        if price_elem:
            # Detect sale vs regular price
            sale_span = price_elem.select_one("ins .woocommerce-Price-amount")
            if sale_span:
                # when discounted, the regular price is usually inside <del>;
                # fall back to the first amount if <del> is missing
                regular_span = (
                    price_elem.select_one("del .woocommerce-Price-amount")
                    or price_elem.select_one(".woocommerce-Price-amount")
                )
            else:
                # no discount: the single amount is the regular price
                regular_span = price_elem.select_one(".woocommerce-Price-amount")

            product_data["price_regular"] = (
                parse_price_text(regular_span.get_text(strip=True))
                if regular_span
                else None
            )
            product_data["price_sale"] = (
                parse_price_text(sale_span.get_text(strip=True))
                if sale_span
                else None
            )
        else:
            product_data["price_regular"] = None
            product_data["price_sale"] = None

        # Stock status from the instock / outofstock classes on the product div
        stock_status = "unknown"
        if product_div:
            classes = product_div.get("class", [])
            if "instock" in classes:
                stock_status = "in_stock"
            elif "outofstock" in classes:
                stock_status = "out_of_stock"
        product_data["stock_status"] = stock_status

        # Stock quantity is usually not shown on the frontend, so leave it as None
        product_data["stock_quantity"] = None

        # Tags
        tags: List[str] = []
        tag_links = soup.select(".tagged_as a[rel='tag']")
        for t in tag_links:
            tags.append(t.get_text(strip=True))
        product_data["tags"] = tags

        # Attributes (tab "Additional information")
        attributes: Dict[str, str] = {}
        for row in soup.select(".woocommerce-product-attributes tr"):
            label_el = row.select_one("th")
            value_el = row.select_one("td")
            if label_el and value_el:
                label = label_el.get_text(strip=True)
                value = value_el.get_text(strip=True)
                attributes[label] = value
        product_data["attributes"] = attributes

        # Average rating (displayed above the title)
        rating_elem = soup.select_one(".woocommerce-product-rating .star-rating")
        product_data["average_rating"] = extract_rating_from_container(rating_elem)

        # Review count
        review_count_el = soup.select_one(".woocommerce-review-link")
        if review_count_el:
            # text like "3 customer reviews"
            text = review_count_el.get_text(strip=True)
            m = re.search(r"(\d+)", text)
            if m:
                product_data["rating_count"] = int(m.group(1))
            else:
                product_data["rating_count"] = 0
        else:
            product_data["rating_count"] = 0

        # Individual reviews
        reviews: List[Dict] = []
        review_items = soup.select("ol.commentlist li.review")
        for item in review_items:
            review: Dict = {}

            # Author
            author_el = item.select_one(".woocommerce-review__author")
            review["author"] = author_el.get_text(strip=True) if author_el else None

            # Rating (star-rating inside the review)
            rating_container = item.select_one(".star-rating")
            review["rating"] = extract_rating_from_container(rating_container)

            # Date
            date_el = item.select_one("time.woocommerce-review__published-date")
            review["date"] = (
                date_el.get("datetime", "").split("T")[0] if date_el else None
            )

            # Text
            text_el = item.select_one(".description")
            review["text"] = text_el.get_text(strip=True) if text_el else None

            reviews.append(review)

        product_data["reviews"] = reviews

        # Images
        images: List[str] = []

        # Main image
        main_img = soup.select_one(".woocommerce-product-gallery__image img")
        if main_img:
            img_url = (
                main_img.get("data-large_image")
                or main_img.get("src")
                or main_img.get("data-src")
            )
            if img_url:
                images.append(img_url)

        # Gallery images
        gallery_divs = soup.select(
            ".woocommerce-product-gallery__wrapper > div[data-thumb]"
        )
        for div in gallery_divs:
            img = div.select_one("img")
            if not img:
                continue
            img_url = (
                img.get("data-large_image")
                or img.get("src")
                or img.get("data-src")
            )
            if img_url and img_url not in images:
                images.append(img_url)

        product_data["images"] = images

        logger.info(f"✓ Parsed product: {product_data.get('name')}")
    except Exception as e:
        logger.error(f"Error parsing product {url}: {e}", exc_info=True)

    return product_data


def scrape_shop(start_url: str = SHOP_URL) -> List[Dict]:
    """
    Κάνει crawl το /shop/ με pagination και επιστρέφει λίστα με όλα τα προϊόντα.
    """
    all_products: List[Dict] = []
    current_url = start_url
    page_num = 1

    while current_url:
        logger.info("=" * 60)
        logger.info(f"Listing page #{page_num}: {current_url}")
        html = fetch_page(current_url)
        if not html:
            logger.warning(f"Could not fetch listing page: {current_url}")
            break

        product_urls = parse_product_list(html)
        if not product_urls:
            logger.info("No product URLs found on this page. Stopping.")
            break

        for product_url in product_urls:
            logger.info(f"  -> scraping product: {product_url}")
            product_html = fetch_page(product_url)
            if not product_html:
                logger.warning(f"  !! failed to fetch product: {product_url}")
                continue

            product_data = parse_product_page(product_html, product_url)
            all_products.append(product_data)

            time.sleep(REQUEST_DELAY)

        # Find the next listing page
        next_url = get_next_page_url(html, current_url)
        if next_url:
            current_url = next_url
            page_num += 1
        else:
            logger.info("No next page found – scraping finished.")
            break

    return all_products


def save_to_json(products: List[Dict], filename: str = OUTPUT_FILE) -> None:
    """
    Αποθηκεύει τα δεδομένα προϊόντων σε JSON.
    """
    with open(filename, "w", encoding="utf-8") as f:
        json.dump({"products": products}, f, ensure_ascii=False, indent=2)
    logger.info(f"Saved {len(products)} products to {filename}")


def main() -> None:
    """
    Main entry point.
    """
    logger.info("Starting WooCommerce scraper...")
    logger.info(f"Target base URL: {BASE_URL}")

    try:
        products = scrape_shop()
        if products:
            save_to_json(products)
            logger.info("\n" + "=" * 60)
            logger.info("SCRAPING SUMMARY")
            logger.info("=" * 60)
            logger.info(f"Total products scraped: {len(products)}")
            logger.info(
                f"Products with reviews: "
                f"{sum(1 for p in products if p.get('reviews'))}"
            )
            logger.info(
                f"Products with attributes: "
                f"{sum(1 for p in products if p.get('attributes'))}"
            )
            logger.info(f"Output file: {OUTPUT_FILE}")
        else:
            logger.warning("No products were scraped.")
    except KeyboardInterrupt:
        logger.info("Scraping interrupted by user.")
    except Exception as e:
        logger.error(f"Unexpected error: {e}", exc_info=True)


if __name__ == "__main__":
    main()
