# Import core packages
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
from collections import Counter
import plotly.express as px

# Render Plotly figures with the "notebook" renderer.
pio.renderers.default = "notebook"

# Do not truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

# Root folder of the pipeline output and its two sub-folders.
BASE = "output_data"
CLEANED_DIR, VIEWS_DIR = (
    os.path.join(BASE, subfolder) for subfolder in ("cleaned_tables", "views")
)

# Helper function to load all CSVs from a folder into a dict
def load_csvs_from_directory(directory_path):
    """Load every ``.csv`` file in a directory into a dict of DataFrames.

    Args:
        directory_path: Folder to scan (not recursive).

    Returns:
        dict mapping table name -> DataFrame. The table name is the file name
        without its extension and without a leading ``cleaned_`` prefix.

    Raises:
        FileNotFoundError: If ``directory_path`` does not exist (from
            ``os.listdir``).
    """
    prefix = "cleaned_"
    tables = {}
    # os.listdir returns entries in arbitrary order; sorting makes the dict
    # order (and the winner of any key collision) reproducible.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".csv"):
            continue
        table_name = os.path.splitext(filename)[0]
        # Strip the prefix only when it really is a prefix; a plain
        # str.replace would also mangle names that contain it elsewhere.
        if table_name.startswith(prefix):
            table_name = table_name[len(prefix):]
        tables[table_name] = pd.read_csv(os.path.join(directory_path, filename))
    return tables

# Load cleaned tables and views
# Both dicts are keyed by table/view name (see load_csvs_from_directory).
cleaned = load_csvs_from_directory(CLEANED_DIR)
views = load_csvs_from_directory(VIEWS_DIR)

# See available tables
# NOTE: a bare expression only displays something when run as a notebook cell;
# in a plain script it is evaluated and discarded.
list(cleaned.keys())

# Pasted notebook output of the cell above (a no-op expression in a script).
['returns',
 'customers',
 'orders',
 'return_items',
 'order_items',
 'product_catalog']

# See available modeled views
list(views.keys())

# Pasted notebook output of the cell above (a no-op expression in a script).
['monthly_signup_channel_sales_returns',
 'top_customers_by_returns',
 'monthly_payment_sales_returns',
 'return_reason_summary',
 'region_summary_by_state',
 'monthly_sales_returns_summary',
 'monthly_clv_sales_returns',
 'monthly_loyalty_sales_returns',
 'customer_segment_sales_returns',
 'return_rate_by_product',
 'shipping_return_impact_summary',
 'category_monthly_sales_returns',
 'monthly_channel_sales_returns']

# Monthly sales vs. refunds (grouped bar chart)

# Work on a copy of the summary view and derive a readable month label.
df = views["monthly_sales_returns_summary"].copy()
df["month"] = pd.to_datetime(df["month"])
df["month_label"] = df["month"].dt.strftime("%B %Y")

fig = go.Figure()

# One bar series per measure: (column, legend name, bar colour).
for measure_col, legend_name, bar_colour in (
    ("total_sales", "Total Sales", "royalblue"),
    ("total_refunds", "Total Refunds", "firebrick"),
):
    fig.add_trace(
        go.Bar(
            x=df["month_label"],
            y=df[measure_col],
            name=legend_name,
            marker_color=bar_colour,
        )
    )

# Side-by-side bars, a single unified hover box per month, legend below the plot.
fig.update_layout(
    title="🧾 Monthly Sales vs Refunds",
    xaxis_title="Month",
    yaxis_title="Amount ($)",
    barmode="group",
    hovermode="x unified",
    xaxis_tickangle=-45,
    legend={"orientation": "h", "y": -0.2, "x": 0.5, "xanchor": "center"},
    height=450,
)

fig.show()

# Month-over-month total sales

# assign() returns a new frame, so the stored view is left untouched.
df = views["monthly_sales_returns_summary"].assign(
    month=lambda frame: pd.to_datetime(frame["month"])
)

# Bars are shaded on a green scale by their own value.
fig = px.bar(
    data_frame=df,
    x="month",
    y="total_sales",
    color="total_sales",
    color_continuous_scale="Greens",
    labels={"total_sales": "Total Sales ($)"},
    title="Monthly Total Sales",
)
fig.update_layout(hovermode="x unified")
fig.show()

# Month-over-month share of revenue refunded

# assign() returns a new frame, so the stored view is left untouched.
df = views["monthly_sales_returns_summary"].assign(
    month=lambda frame: pd.to_datetime(frame["month"])
)

# Bars are shaded on a red scale by their own value.
fig = px.bar(
    data_frame=df,
    x="month",
    y="percent_revenue_returned",
    color="percent_revenue_returned",
    color_continuous_scale="Reds",
    labels={"percent_revenue_returned": "% Revenue Refunded"},
    title="Monthly % Revenue Refunded",
)
fig.update_layout(hovermode="x unified")
fig.show()

# Categorize "Unknown" regions by state code

# NOTE: deliberately NOT a copy. The relabelled "region" column is written
# back into views["region_summary_by_state"], which the region charts below
# read again.
raw_region_df = views["region_summary_by_state"]
unknowns = raw_region_df[raw_region_df["region"].str.lower() == "unknown"]
unknowns  # notebook display of the rows still labelled "unknown"

# Standardize outlier territories: state code (lower-case) -> region label.
territory_map = {
    "fm": "Micronesia",
    "mh": "Marshall Islands",
    "pw": "Palau",
    "_a": "Unassigned"
}

# Vectorised replacement for the former row-wise apply: rows whose state code
# is in territory_map get the mapped label, all others keep their region.
# Unlike row["state_code"].lower(), this tolerates missing state codes and an
# empty frame.
mapped_regions = raw_region_df["state_code"].str.lower().map(territory_map)
raw_region_df["region"] = raw_region_df["region"].where(
    mapped_regions.isna(), mapped_regions
)

# Total sales by region

# Parse the month (bad values become NaT) and keep only complete rows.
region_df = (
    views["region_summary_by_state"]
    .assign(
        order_month=lambda frame: pd.to_datetime(frame["order_month"], errors="coerce")
    )
    .dropna(subset=["order_month", "region", "total_sales"])
    .copy()
)

# Sum sales per region; ascending sort puts the largest bar at the top of a
# horizontal chart.
sales_per_region = region_df.groupby("region", as_index=False)["total_sales"].sum()
region_sales = sales_per_region.sort_values("total_sales", ascending=True)

fig_sales = px.bar(
    data_frame=region_sales,
    x="total_sales",
    y="region",
    orientation="h",
    labels={"total_sales": "Total Sales ($)", "region": "Region"},
    title="💰 Total Sales by Region",
)
fig_sales.update_layout(height=400, xaxis_tickformat="$,.0f")
fig_sales.show()

# Refund rate by region

# Sum sales and refunds per region, then express refunds as a share of sales.
region_totals = region_df.groupby("region", as_index=False)[
    ["total_sales", "total_refunds"]
].sum()
region_returns = region_totals.assign(
    return_rate=region_totals["total_refunds"] / region_totals["total_sales"]
).sort_values("return_rate", ascending=True)

# Plot

fig_returns = px.bar(
    data_frame=region_returns,
    x="return_rate",
    y="region",
    orientation="h",
    color_discrete_sequence=["crimson"],
    labels={"return_rate": "Refund Rate", "region": "Region"},
    title="🔁 Refund Rate by Region",
)

# The rate is a fraction, so format the axis ticks as percentages.
fig_returns.update_layout(height=400, xaxis_tickformat=".1%")
fig_returns.show()

# Month-over-month order volume and refund % by shipping speed

# Load the view, coerce unparseable months to NaT, and drop incomplete rows.
df = views["shipping_return_impact_summary"].copy()
df["order_month"] = pd.to_datetime(df["order_month"], errors="coerce")
df = df.dropna(subset=["order_month", "shipping_speed", "total_orders", "revenue_refunded_pct"]).copy()

# Capture the speeds in their original order of appearance so the trace and
# legend order is not affected by the sort below.
speeds = df["shipping_speed"].unique()

# The x values are strings (a categorical axis), so chronological order has to
# be established explicitly rather than left to row order.
df = df.sort_values("order_month", kind="stable")
df["month_label"] = df["order_month"].dt.strftime("%B %Y")
month_order = df["month_label"].drop_duplicates().tolist()

# Two stacked panels sharing one x-axis.
fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.12,
    subplot_titles=[
        "Monthly Order Volume by Shipping Speed",
        "Revenue Refunded % by Shipping Speed"
    ]
)

# Plot 1: total orders (grouped bars), one trace per shipping speed.
for speed in speeds:
    subset = df[df["shipping_speed"] == speed]
    fig.add_trace(
        go.Bar(
            x=subset["month_label"],
            y=subset["total_orders"],
            name=f"{speed.title()} Orders"
        ),
        row=1,
        col=1
    )

# Plot 2: revenue refunded % (lines), one trace per shipping speed.
for speed in speeds:
    subset = df[df["shipping_speed"] == speed]
    fig.add_trace(
        go.Scatter(
            x=subset["month_label"],
            y=subset["revenue_refunded_pct"],
            mode="lines+markers",
            name=f"{speed.title()} Refund %",
            hovertemplate="%{x}<br>%{y:.1f}%<extra></extra>"
        ),
        row=2,
        col=1
    )

# Final layout. With shared_xaxes=True only the bottom axis (xaxis2) shows
# tick labels, so the "Month" title belongs there rather than on xaxis.
fig.update_layout(
    height=750,
    title_text="Shipping Speed Analysis: Order Volume and Refund % (MoM)",
    barmode="group",
    xaxis2=dict(title="Month"),
    yaxis=dict(title="Total Orders"),
    yaxis2=dict(title="Revenue Refunded (%)"),
    legend=dict(orientation="h", y=-0.2, x=0.5, xanchor="center"),
    hovermode="x unified"
)
# Pin the category order on both x-axes so months stay chronological even if
# some speed has no data for a month.
fig.update_xaxes(categoryorder="array", categoryarray=month_order)

fig.show()

# Percentage of overnight-shipping orders by state

# Copy first: adding the "state" column must not mutate cleaned["orders"],
# which later cells load again.
orders_df = cleaned["orders"].copy()
# Pull the two-letter state that sits between ", " and a 5-digit ZIP code.
orders_df["state"] = orders_df["shipping_address"].str.extract(
    r",\s([A-Z]{2})\s\d{5}", expand=False
)
orders_df = orders_df.dropna(subset=["state", "shipping_speed"])

# Share of each shipping speed within every state (fractions summing to 1).
state_shipping = (
    orders_df.groupby("state")
    .shipping_speed.value_counts(normalize=True)
    .rename("share")
    .reset_index()
)
overnight_share = state_shipping[state_shipping["shipping_speed"].str.lower() == "overnight"]

# Plot the map
fig = px.choropleth(
    overnight_share,
    locations="state",
    locationmode="USA-states",
    color="share",
    color_continuous_scale="Reds",
    scope="usa",
    title="📦 Percent of Orders Using Overnight Shipping by State",
    labels={"share": "% Overnight Shipping"}
)
fig.update_layout(
    geo=dict(bgcolor="rgba(0,0,0,0)"),
    # "share" is a 0-1 fraction; format the colour bar as a percentage so it
    # matches its "%" title.
    coloraxis_colorbar=dict(title="% Overnight", tickformat=".0%")
)
fig.show()

# Build visual of Sales and Refunds by Customer Type

# --- Prepare data ---
# (The unused copy of the channel view that used to be made here was dropped.)
df_segments = views["customer_segment_sales_returns"].copy()

# Map the is_guest flag (compared as lower-case text) to a display label;
# anything that is neither "true" nor "false" is treated as Registered.
df_segments["is_guest"] = df_segments["is_guest"].astype(str).str.lower()
df_segments["customer_type"] = df_segments["is_guest"].map({
    "true": "Guest",
    "false": "Registered"
}).fillna("Registered")
df_segments["month"] = pd.to_datetime(df_segments["month"])
df_segments["month_label"] = df_segments["month"].dt.strftime("%b %Y")
df_segments = df_segments.dropna(subset=["total_sales", "percent_revenue_returned"])

# Month ordering: labels are strings, so build an ordered categorical to keep
# the x-axis chronological. Rows without a month label are dropped.
valid_months = df_segments["month_label"].dropna().unique()
ordered_months = sorted(valid_months, key=pd.to_datetime)
df_segments = df_segments[df_segments["month_label"].isin(ordered_months)].copy()
df_segments["month_label"] = pd.Categorical(df_segments["month_label"], categories=ordered_months, ordered=True)

# --- Build subplots ---
fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.15,
    subplot_titles=(
        "Monthly Sales by Customer Type",
        "Refund % by Customer Type"
    )
)

# First subplot: grouped bars of total sales by customer type
for cust_type in df_segments["customer_type"].unique():
    cust_df = df_segments[df_segments["customer_type"] == cust_type].sort_values("month")
    fig.add_trace(
        go.Bar(
            x=cust_df["month_label"],
            y=cust_df["total_sales"],
            name=f"{cust_type} Sales",
            hovertemplate="%{x}<br>Total Sales: $%{y:,.0f}<br>Customer Type: " + cust_type + "<extra></extra>"
        ),
        row=1, col=1
    )

# Second subplot: refund % by customer type (fixed colour per type)
line_colors = {
    "Guest": "darkorange",
    "Registered": "royalblue"
}

for cust_type in df_segments["customer_type"].unique():
    cust_df = df_segments[df_segments["customer_type"] == cust_type].sort_values("month")
    fig.add_trace(
        go.Scatter(
            x=cust_df["month_label"],
            y=cust_df["percent_revenue_returned"],
            name=f"{cust_type} (% Refunded)",
            mode="lines+markers",
            line=dict(color=line_colors.get(cust_type, None), width=2),
            hovertemplate="%{x}<br>Refund %: %{y:.1f}%<br>Customer Type: " + cust_type + "<extra></extra>"
        ),
        row=2, col=1
    )

# Layout (single call; the earlier intermediate layout only set values that
# were overwritten here, apart from barmode). With shared_xaxes=True only the
# bottom axis (xaxis2) shows tick labels, so the axis title and tick angle are
# set there rather than on xaxis.
fig.update_layout(
    barmode="group",
    title="Monthly Sales and Refund Rate by Customer Type",
    xaxis2=dict(title="Month", tickangle=-45),
    yaxis=dict(title="Total Sales ($)", rangemode="tozero"),
    yaxis2=dict(title="% Revenue Refunded", rangemode="tozero"),
    height=700,
    hovermode="x unified",
    legend=dict(orientation="h", x=0.5, xanchor="center", y=-0.15)
)

fig.show()

# Month-over-month sales by CLV bucket

# Parse the month (bad values become NaT), drop incomplete rows, add a label.
df = (
    views["monthly_clv_sales_returns"]
    .assign(month=lambda frame: pd.to_datetime(frame["month"], errors="coerce"))
    .dropna(subset=["month", "clv_bucket", "total_sales"])
    .assign(month_label=lambda frame: frame["month"].dt.strftime("%B %Y"))
)

fig = px.bar(
    data_frame=df,
    x="month_label",
    y="total_sales",
    color="clv_bucket",
    barmode="group",
    labels={"month_label": "Month", "total_sales": "Total Sales ($)"},
    title="Monthly Sales by CLV Bucket",
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()

# Month-over-month refund % by CLV bucket (grouped bar chart)

# Parse the month (bad values become NaT), drop incomplete rows, add a label.
df = (
    views["monthly_clv_sales_returns"]
    .assign(month=lambda frame: pd.to_datetime(frame["month"], errors="coerce"))
    .dropna(subset=["month", "clv_bucket", "percent_revenue_returned"])
    .assign(month_label=lambda frame: frame["month"].dt.strftime("%B %Y"))
)

axis_labels = {
    "month_label": "Month",
    "percent_revenue_returned": "% Revenue Refunded",
    "clv_bucket": "CLV Bucket",
}
fig = px.bar(
    data_frame=df,
    x="month_label",
    y="percent_revenue_returned",
    color="clv_bucket",
    barmode="group",
    labels=axis_labels,
    title="📉 Monthly Refund % by CLV Bucket",
)
fig.update_layout(hovermode="x unified", xaxis_tickangle=-45)
fig.show()

# Month-over-month sales by loyalty tier

# Parse the month (bad values become NaT), drop rows with missing data, and
# add a readable month label. plotly.express is already imported as px at the
# top of the file, so it is not re-imported here.
df = (
    views["monthly_loyalty_sales_returns"]
    .assign(month=lambda frame: pd.to_datetime(frame["month"], errors="coerce"))
    .dropna(subset=["month", "loyalty_tier", "total_sales"])
    .assign(month_label=lambda frame: frame["month"].dt.strftime("%B %Y"))
)

fig = px.bar(
    data_frame=df,
    x="month_label",
    y="total_sales",
    color="loyalty_tier",
    barmode="group",
    labels={"month_label": "Month", "total_sales": "Total Sales ($)"},
    title="Monthly Sales by Loyalty Tier",
)

fig.update_layout(xaxis_tickangle=-45)
fig.show()

# Build visual of MoM loyalty tier returns (as grouped bar chart)

df = views["monthly_loyalty_sales_returns"].copy()
# Coerce unparseable months to NaT and drop them, matching the sibling
# loyalty/CLV cells; otherwise such rows would get a NaN month label.
df["month"] = pd.to_datetime(df["month"], errors="coerce")
df = df.dropna(subset=["month", "loyalty_tier", "percent_revenue_returned"])
df["month_label"] = df["month"].dt.strftime("%B %Y")

fig = px.bar(
    df,
    x="month_label",
    y="percent_revenue_returned",
    color="loyalty_tier",
    barmode="group",
    title="📉 Monthly Refund % by Loyalty Tier",
    labels={
        "month_label": "Month",
        "percent_revenue_returned": "% Revenue Refunded",
        "loyalty_tier": "Loyalty Tier"
    }
)
fig.update_layout(xaxis_tickangle=-45, hovermode="x unified")
fig.show()

# Cohort size: number of customers per signup quarter

# Work on copies of the cleaned tables.
customers = cleaned["customers"].copy()
orders = cleaned["orders"].copy()
returns = cleaned["returns"].copy()

# Normalise customer_id to stripped strings in all three tables so the joins
# below compare like with like.
for table in (customers, orders, returns):
    table["customer_id"] = table["customer_id"].astype(str).str.strip()

# Signup quarter; customers without a parseable signup date are dropped
# (the original comment treats these as guests).
customers["signup_date"] = pd.to_datetime(customers["signup_date"], errors="coerce")
customers = customers.dropna(subset=["signup_date"]).assign(
    signup_quarter=lambda frame: frame["signup_date"].dt.to_period("Q").astype(str)
)

# Total order value per customer (order_total may arrive as text).
orders["order_total"] = pd.to_numeric(orders["order_total"], errors="coerce")
clv = orders.groupby("customer_id", as_index=False).agg(
    total_sales=("order_total", "sum")
)

# Total refunded amount per customer.
returns["refunded_amount"] = pd.to_numeric(returns["refunded_amount"], errors="coerce")
refunds = returns.groupby("customer_id", as_index=False).agg(
    total_refunds=("refunded_amount", "sum")
)

# One row per customer: cohort + sales + refunds (NaN where there are none).
cohort_df = customers[["customer_id", "signup_quarter"]].merge(
    clv, on="customer_id", how="left"
)
cohort_df = cohort_df.merge(refunds, on="customer_id", how="left")

# Distinct customers per signup cohort.
cohort_counts = cohort_df.groupby("signup_quarter", as_index=False).agg(
    num_customers=("customer_id", "nunique")
)

# Horizontal bar chart, cohorts ordered by size.
fig = px.bar(
    data_frame=cohort_counts,
    x="num_customers",
    y="signup_quarter",
    orientation="h",
    labels={"signup_quarter": "Signup Quarter", "num_customers": "Customer Count"},
    title="👥 Number of Customers per Signup Cohort",
)
fig.update_layout(yaxis={"categoryorder": "total ascending"})
fig.show()

# Average return rate by signup cohort

# Customers with no orders / no returns count as zero.
for money_col in ("total_sales", "total_refunds"):
    cohort_df[money_col] = cohort_df[money_col].fillna(0)

# Per-customer refund share; divisions by zero (inf or NaN) are blanked out so
# they do not distort the cohort mean.
raw_rate = cohort_df["total_refunds"] / cohort_df["total_sales"]
cohort_df["return_rate"] = raw_rate.where(np.isfinite(raw_rate))

# Mean of the per-customer rates within each cohort.
summary = (
    cohort_df.groupby("signup_quarter", as_index=False)["return_rate"]
    .mean()
    .rename(columns={"return_rate": "avg_return_rate"})
)

# Horizontal bar chart, cohorts ordered by their average rate.
fig = px.bar(
    data_frame=summary,
    x="avg_return_rate",
    y="signup_quarter",
    orientation="h",
    labels={"signup_quarter": "Signup Quarter", "avg_return_rate": "Avg Return Rate"},
    title="🔁 Average Return Rate by Signup Cohort",
)
fig.update_layout(xaxis_tickformat=".1%", yaxis={"categoryorder": "total ascending"})
fig.show()

# Sales by signup cohort

# Sum the per-customer sales within each signup quarter.
sales_by_cohort = cohort_df.groupby("signup_quarter", as_index=False).agg(
    total_sales_usd=("total_sales", "sum")
)

fig = px.bar(
    data_frame=sales_by_cohort,
    x="signup_quarter",
    y="total_sales_usd",
    labels={"signup_quarter": "Signup Quarter", "total_sales_usd": "Total Sales ($)"},
    title="💵 Total Sales by Signup Cohort",
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()

# Total refunds by signup cohort

# Re-apply the type normalisation so this cell does not depend on column
# dtypes set elsewhere.
returns["refunded_amount"] = pd.to_numeric(returns["refunded_amount"], errors="coerce")
for table in (customers, returns):
    table["customer_id"] = table["customer_id"].astype(str).str.strip()
customers["signup_date"] = pd.to_datetime(customers["signup_date"], errors="coerce")
customers["signup_quarter"] = customers["signup_date"].dt.to_period("Q").astype(str)

# Attach each return's signup cohort (NaN when the customer is not found).
cohort_lookup = customers[["customer_id", "signup_quarter"]]
returns = returns.merge(cohort_lookup, on="customer_id", how="left")

# Keep only returns with a real cohort: not missing and not the literal
# string "NaT" that a missing date turns into after astype(str).
has_cohort = returns["signup_quarter"].notna() & (returns["signup_quarter"] != "NaT")
returns_clean = returns[has_cohort]

# Sum refunded amounts per cohort.
refund_summary = returns_clean.groupby("signup_quarter", as_index=False).agg(
    total_refunds_usd=("refunded_amount", "sum")
)

fig = px.bar(
    data_frame=refund_summary,
    x="signup_quarter",
    y="total_refunds_usd",
    labels={"signup_quarter": "Signup Quarter", "total_refunds_usd": "Total Refunds ($)"},
    title="💸 Total Refunds by Signup Cohort",
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()

# 📦 Cohort Category Preferences: What Each Signup Cohort Bought Most in Their First Month
# (pandas and plotly.express are already imported at the top of the file.)

# --- Load Cleaned Data ---
orders = cleaned["orders"].copy()
order_items = cleaned["order_items"].copy()
products = cleaned["product_catalog"].copy()
customers = cleaned["customers"].copy()

# --- Standardize IDs ---
# Only real identifier columns are converted to stripped strings. A substring
# test ('"id" in col') would also stringify unrelated columns whose names
# merely contain "id".
for table in (orders, order_items, customers, products):
    for col in table.columns:
        if col == "id" or col.endswith("_id"):
            table[col] = table[col].astype(str).str.strip()

# --- Convert Dates ---
orders["order_date"] = pd.to_datetime(orders["order_date"], errors="coerce")
customers["signup_date"] = pd.to_datetime(customers["signup_date"], errors="coerce")

# Drop rows without a usable date BEFORE deriving period strings:
# to_period(...).astype(str) turns NaT into the literal string "NaT", which
# dropna() cannot see and which would compare equal to another "NaT".
orders = orders.dropna(subset=["order_date"])
customers = customers.dropna(subset=["signup_date"])

orders["order_month"] = orders["order_date"].dt.to_period("M").astype(str)
customers["signup_month"] = customers["signup_date"].dt.to_period("M").astype(str)
customers["signup_quarter"] = customers["signup_date"].dt.to_period("Q").astype(str)

# --- Merge Orders, Items, Customers, Products ---
merged = (
    order_items
    .merge(orders[["order_id", "customer_id", "order_month"]], on="order_id", how="left")
    .merge(customers[["customer_id", "signup_month", "signup_quarter"]], on="customer_id", how="left")
    .merge(products[["product_id", "category"]].rename(columns={"category": "product_category"}), on="product_id", how="left")
)

merged["quantity"] = pd.to_numeric(merged["quantity"], errors="coerce")

# Items whose customer has no signup cohort (left-join miss) are excluded.
merged = merged.dropna(subset=["signup_quarter"])

# --- Filter to Orders Placed in Signup Month ---
cohort_orders = merged[merged["order_month"] == merged["signup_month"]].copy()

# --- Aggregate + Rank ---
top_cats = (
    cohort_orders.groupby(["signup_quarter", "product_category"], as_index=False)["quantity"]
    .sum()
    .sort_values(["signup_quarter", "quantity"], ascending=[True, False])
)
# method="first" breaks ties by row order so exactly five rows rank <= 5.
top_cats["rank"] = top_cats.groupby("signup_quarter")["quantity"].rank(method="first", ascending=False)

# --- Filter Top 5 per Cohort ---
top5 = top_cats[top_cats["rank"] <= 5]

# One facet row per cohort, categories on the y-axis.
fig = px.bar(
    top5,
    x="quantity",
    y="product_category",
    color="signup_quarter",
    facet_row="signup_quarter",
    title="🏆 Top 5 Product Categories Purchased in Signup Month by Cohort",
    labels={"product_category": "Category", "quantity": "Units Sold"},
    orientation="h",
    height=1100
)

fig.update_layout(
    xaxis_title="Units Sold",
    yaxis_title=None,
    xaxis_tickformat=",",
    hovermode="closest"
)

fig.show()

# Build visual of Sales and Refunds by order channel

# Prepare data: drop incomplete rows, parse the month, add a short label.
channel_df = views["monthly_channel_sales_returns"].copy()
channel_df = channel_df.dropna(subset=["order_channel", "month", "total_sales", "percent_revenue_returned"]).copy()
channel_df["month"] = pd.to_datetime(channel_df["month"])
channel_df["month_label"] = channel_df["month"].dt.strftime("%b %Y")

# Labels are strings, so use an ordered categorical to keep the x-axis
# chronological.
valid_months = channel_df["month_label"].dropna().unique()
ordered_months = sorted(valid_months, key=pd.to_datetime)
channel_df["month_label"] = pd.Categorical(channel_df["month_label"], categories=ordered_months, ordered=True)

# Build subplots: two stacked panels sharing one x-axis.
fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.15,
    subplot_titles=(
        "Monthly Sales by Order Channel",
        "Refund % by Order Channel"
    )
)

# First subplot: total sales by order_channel (grouped bar)
for ch in channel_df["order_channel"].unique():
    ch_df = channel_df[channel_df["order_channel"] == ch].sort_values("month")
    fig.add_trace(
        go.Bar(
            x=ch_df["month_label"],
            y=ch_df["total_sales"],
            name=f"{ch.title()} Sales",
            hovertemplate="%{x}<br>Total Sales: $%{y:,.0f}<br>Channel: " + ch.title() + "<extra></extra>"
        ),
        row=1, col=1
    )

# Second subplot: refund % by order_channel
for ch in channel_df["order_channel"].unique():
    ch_df = channel_df[channel_df["order_channel"] == ch].sort_values("month")
    fig.add_trace(
        go.Scatter(
            x=ch_df["month_label"],
            y=ch_df["percent_revenue_returned"],
            name=f"{ch.title()} (% Refunded)",
            mode="lines+markers",
            hovertemplate="%{x}<br>Refund %: %{y:.1f}%<br>Channel: " + ch.title() + "<extra></extra>"
        ),
        row=2, col=1
    )

# Final layout. With shared_xaxes=True only the bottom axis (xaxis2) shows
# tick labels, so the axis title and tick angle are set there rather than on
# xaxis. barmode="group" places the channel bars side by side per month.
fig.update_layout(
    barmode="group",
    title="Monthly Sales and Refund Rate by Order Channel",
    xaxis2=dict(title="Month", tickangle=-45),
    yaxis=dict(title="Total Sales ($)", rangemode="tozero"),
    yaxis2=dict(title="% Revenue Refunded", rangemode="tozero"),
    height=700,
    hovermode="x unified",
    legend=dict(orientation="h", x=0.5, xanchor="center", y=-0.15)
)

fig.show()

# Month-over-month sales by payment method

# Parse the month (bad values become NaT), drop incomplete rows, and add a
# label such as "July 2024".
df = (
    views["monthly_payment_sales_returns"]
    .assign(month=lambda frame: pd.to_datetime(frame["month"], errors="coerce"))
    .dropna(subset=["payment_method", "total_sales", "month"])
    .copy()
)
df["month_label"] = df["month"].dt.strftime("%B %Y")

axis_labels = {
    "month_label": "Month",
    "total_sales": "Total Sales ($)",
    "payment_method": "Payment Method",
}

# Grouped bars annotated with abbreviated values (e.g. "12k").
fig = px.bar(
    data_frame=df,
    x="month_label",
    y="total_sales",
    color="payment_method",
    barmode="group",
    labels=axis_labels,
    text_auto=".2s",
    title="Monthly Sales by Payment Method",
)

# Untitled horizontal legend centred below the plot.
fig.update_layout(
    hovermode="x unified",
    xaxis_tickangle=-45,
    legend={"title": None, "orientation": "h", "x": 0.5, "xanchor": "center", "y": -0.25},
)

fig.show()

# Monthly refund % by payment method

# Parse the month and drop rows missing the method or the refund percentage.
df = (
    views["monthly_payment_sales_returns"]
    .assign(month=lambda frame: pd.to_datetime(frame["month"]))
    .dropna(subset=["payment_method", "percent_revenue_returned"])
)

axis_labels = {
    "month": "Month",
    "percent_revenue_returned": "% Revenue Refunded",
    "payment_method": "Payment Method",
}

# Grouped bars on a date axis, annotated with one decimal place.
fig = px.bar(
    data_frame=df,
    x="month",
    y="percent_revenue_returned",
    color="payment_method",
    barmode="group",
    labels=axis_labels,
    text_auto=".1f",
    title="Monthly Refund % by Payment Method",
)

fig.update_layout(
    legend={"title": None, "orientation": "h", "x": 0.5, "xanchor": "center", "y": -0.25},
    hovermode="x unified",
)

fig.show()

# Top 10 customers by total refunded

# Largest refund totals first. `df` is reused by the two charts that follow.
df = views["top_customers_by_returns"].copy().sort_values(
    "total_refunded", ascending=False
)

# Display-only string version of the return rate, e.g. "12.3%".
rounded_rate = df["return_rate"].round(1)
df["return_rate_pct"] = rounded_rate.astype(str) + "%"

# Columns shown in the summary table.
cols = [
    "customer_id",
    "clv_bucket",
    "loyalty_tier",
    "signup_channel",
    "total_orders",
    "total_sales",
    "total_returns",
    "total_refunded",
    "avg_return_value",
    "return_rate_pct",
]
df.loc[:, cols].head(10)

# Top 10 customers by total refunded, coloured by signup channel

top_ten = df.head(10)
fig = px.bar(
    data_frame=top_ten,
    x="customer_id",
    y="total_refunded",
    color="signup_channel",
    hover_data=["clv_bucket", "total_sales", "total_returns", "return_rate"],
    title="Top 10 Customers by Total Refunds - (Colored by Signup Channel)",
)
fig.update_layout(
    xaxis_tickangle=-45,
    xaxis_title="Customer ID",
    yaxis_title="Total Refunded ($)",
)
fig.show()

# Top 10 customers by total refunded, shaded by return rate

top_ten = df.head(10)
fig = px.bar(
    data_frame=top_ten,
    x="customer_id",
    y="total_refunded",
    color="return_rate",  # numeric column -> continuous colour scale
    hover_data=["clv_bucket", "loyalty_tier", "signup_channel", "total_sales", "total_returns"],
    title="Top 10 Customers by Total Refunded (Shaded by Return Rate)",
)
fig.update_layout(xaxis_tickangle=-45, coloraxis_colorbar_title="% Return Rate")
fig.show()

# Products with the highest return rate (top 15, notebook display)
views["return_rate_by_product"].sort_values(
    "return_rate_percent", ascending=False
).head(15)

# Return reasons breakdown: refunded amount per normalized reason, largest first

df_reasons = views["return_reason_summary"].copy()
reasons_by_refund = df_reasons.sort_values("total_refunded", ascending=False)
fig = px.bar(
    data_frame=reasons_by_refund,
    x="normalized_reason",
    y="total_refunded",
    title="Total Refunded Amount by Return Reason",
)
fig.show()

# Month-over-month sales vs. returns by category

# Restrict the view to the 12 months from July 2024 through June 2025.
df = views["category_monthly_sales_returns"].copy()
df["month"] = pd.to_datetime(df["month"], format="%Y-%m")
in_window = df["month"].between("2024-07-01", "2025-06-30")
df = df[in_window]

# Total sales per month across all categories (drawn behind the lines).
monthly_sales = df.groupby("month", as_index=False)["total_sales"].sum()

fig = go.Figure()

# Semi-transparent sales bars on the secondary (right-hand) y-axis.
sales_bars = go.Bar(
    x=monthly_sales["month"],
    y=monthly_sales["total_sales"],
    name="Total Sales (right axis)",
    yaxis="y2",
    marker_color="rgba(100, 149, 237, 0.4)",  # cornflowerblue with transparency
    opacity=0.4,
)
fig.add_trace(sales_bars)

# One line per product category on the primary y-axis.
for category in df["product_category"].unique():
    cat_data = df.loc[df["product_category"] == category]
    fig.add_trace(
        go.Scatter(
            x=cat_data["month"],
            y=cat_data["percent_revenue_lost"],
            name=f"{category} (return %)",
            mode="lines+markers",
        )
    )

# The secondary axis overlays the primary one; the legend sits in a bordered
# box centred below the plot.
secondary_axis = {
    "title": "Total Sales ($)",
    "overlaying": "y",
    "side": "right",
    "showgrid": False,
}
legend_box = {
    "x": 0.5,
    "y": -0.2,
    "xanchor": "center",
    "yanchor": "top",
    "orientation": "h",
    "bgcolor": "rgba(255,255,255,0.8)",
    "bordercolor": "lightgray",
    "borderwidth": 1,
}
fig.update_layout(
    title="Return Rate by Category (Jul 2024 – Jun 2025) with Monthly Sales Volume",
    xaxis_title="Month",
    yaxis_title="Percent Revenue Lost",
    yaxis2=secondary_axis,
    barmode="overlay",
    legend=legend_box,
    hovermode="x unified",
)

fig.show()

# Build quality flags and aggregate by "return_reason"

# Work on a copy of the per-product return view. `df` is reused by the
# quality-risk cells further down.
df = views["return_rate_by_product"].copy()

# Normalize the reason text so the membership test below is exact.
df["return_reason"] = df["return_reason"].str.lower().str.strip()

# Return reasons that count as quality problems.
quality_reasons = [
    "defective",
    "arrived damaged",
    "product did not match description",
    "damaged in transit",
    "missing parts",
]

# Flag quality-related rows, then keep only those, worst return rate first.
df["quality_flag"] = df["return_reason"].isin(quality_reasons)
quality_issues = df.loc[df["quality_flag"]].sort_values(
    "return_rate_percent", ascending=False
)

# Show the top 15 rows (notebook display)
quality_issues.head(15)

# Bar chart of quality-flagged returns by total refund value
# (horizontal bars per return reason, shaded by return rate)

axis_labels = {
    "total_refunded": "Total Refunded ($)",
    "return_reason": "Return Reason",
    "return_rate_percent": "Avg Return Rate (%)",
    "return_count": "Returned Items",
}
fig = px.bar(
    data_frame=quality_issues,
    x="total_refunded",
    y="return_reason",
    orientation="h",
    color="return_rate_percent",
    color_continuous_scale="blues",
    text="return_count",
    labels=axis_labels,
    title="Quality-Related Return Reasons by Total Refunds",
)

# Order the reasons by their total bar length.
fig.update_layout(yaxis={"categoryorder": "total ascending"})
fig.show()

# Top 20 quality-flagged products by refund amount

# Return reasons that count as quality problems (repeated here so this cell
# does not rely on the list defined in the earlier cell).
quality_reasons = [
    "defective",
    "arrived damaged",
    "product did not match description",
    "damaged in transit",
    "missing parts",
]

# Normalize the reason text and flag quality-related rows.
normalized_reason = df["return_reason"].str.lower().str.strip()
df["return_reason"] = normalized_reason
df["quality_flag"] = df["return_reason"].isin(quality_reasons)

# Per-product summary over ALL return rows (quality-related or not),
# indexed by product_name.
summary_spec = {
    "total_refunded": ("total_refunded", "sum"),
    "total_returns": ("return_count", "sum"),
    "avg_return_rate": ("return_rate_percent", "mean"),
}
grouped = df.groupby("product_name").agg(**summary_spec)

# Compute top return reason per product (quality flagged only)
def top_reason_info(sub_df):
    """Summarise the dominant return reason among one product's return rows.

    Each reason is weighted by its ``return_count`` so that the numerator and
    the denominator of the reported share are both counts of returns (counting
    rows instead would compare row counts against return counts).

    Args:
        sub_df: DataFrame with ``return_reason`` and ``return_count`` columns
            and at least one row with a non-null reason.

    Returns:
        pd.Series with
        ``top_return_reason`` (label ``"<reason> (<n>/<total>, <pct>) <mark>"``),
        ``dominant_reason_ratio`` (top reason's share of all returns, 0 when
        there are no returns) and ``is_strongly_dominant`` (share > 0.5).
    """
    # sort=False keeps reasons in order of first appearance, so ties resolve
    # to the reason seen first (idxmax returns the first maximum).
    reason_totals = sub_df.groupby("return_reason", sort=False)["return_count"].sum()
    top_reason = reason_totals.idxmax()
    top_count = reason_totals.loc[top_reason]
    total = sub_df["return_count"].sum()
    ratio = top_count / total if total > 0 else 0
    label = f"{top_reason} ({top_count}/{total}, {ratio:.0%})"
    dominant = "⛔️" if ratio > 0.5 else "✅"
    return pd.Series({
        "top_return_reason": f"{label} {dominant}",
        "dominant_reason_ratio": ratio,
        "is_strongly_dominant": ratio > 0.5
    })

# Dominant return reason per product, computed on quality-flagged rows only.
quality_rows = df.loc[df["quality_flag"]]
reason_by_product = quality_rows.groupby("product_name", group_keys=False).apply(
    top_reason_info, include_groups=False
)
# Turn the product_name index back into a regular column for merging.
reason_info = reason_by_product.reset_index()

# Compute quality dominance over total returns
def quality_dominance_info(sub_df):
    """Report the quality-related share of one product's returned items.

    Args:
        sub_df: DataFrame with a numeric ``return_count`` column and a boolean
            ``quality_flag`` column.

    Returns:
        pd.Series with three entries:
            quality_return_pct: flagged return_count divided by total
                return_count (0 when the total is not positive).
            quality_flag_label: tiered risk label (> 0.5 high, > 0.33
                moderate, otherwise low).
            is_quality_dominant: True when the share is above 0.5.
    """
    all_returns = sub_df["return_count"].sum()
    quality_returns = sub_df.loc[sub_df["quality_flag"], "return_count"].sum()
    share = quality_returns / all_returns if all_returns > 0 else 0

    # Tiers are checked from the highest threshold down; first match wins
    risk_tiers = (
        (0.5, "⛔️ High Risk"),
        (0.33, "⚠️ Moderate Risk"),
    )
    risk_label = next(
        (label for floor, label in risk_tiers if share > floor),
        "🟢 Low Risk",
    )

    return pd.Series({
        "quality_return_pct": share,
        "quality_flag_label": risk_label,
        "is_quality_dominant": share > 0.5,  # binary form of the top tier
    })

# Quality share per product, computed over every row in df
dominance_info = (
    df.groupby("product_name", group_keys=False)
    .apply(quality_dominance_info, include_groups=False)
    .reset_index()
)

# Join the three per-product tables on product_name, then keep the 20 rows
# with the largest total_refunded
top_products = (
    pd.merge(
        pd.merge(reason_info, dominance_info, on="product_name"),
        grouped.reset_index(),
        on="product_name",
    )
    .sort_values(by="total_refunded", ascending=False)
    .iloc[:20]
)

# Horizontal bars: length = total refunded, colour = average return rate,
# bar text = return count; quality-risk fields are added to the hover box
fig = px.bar(
    top_products,
    y="product_name",
    x="total_refunded",
    orientation="h",
    text="total_returns",
    color="avg_return_rate",
    color_continuous_scale="blues",
    hover_data=["top_return_reason", "quality_return_pct", "quality_flag_label"],
    labels={
        "total_refunded": "Total Refunded ($)",
        "product_name": "Product",
        "avg_return_rate": "Avg Return Rate (%)",
        "total_returns": "Return Count",
        "top_return_reason": "Top Quality Flag",
        "quality_flag_label": "Return Type Summary",
    },
    title="Top 20 Products Driving Refund Costs — with Quality Risk Flags",
)

fig.update_layout(
    yaxis={
        "categoryorder": "total ascending",
        "tickmode": "linear",  # show every tick
    }
)
fig.show()

# Export the revised dataset and additional data

EXPORT_DIR = "exports/vp_req_analysis"
os.makedirs(EXPORT_DIR, exist_ok=True)

# Top-20 product table assembled for the quality-risk chart
top_products.to_csv(
    path_or_buf=os.path.join(EXPORT_DIR, "top_20_quality_risk_products.csv"),
    index=False,
)

# Optional: copy of the product-level return-rate view, written without the index
df_product_risk = views["return_rate_by_product"].copy()
df_product_risk.to_csv(
    path_or_buf=os.path.join(EXPORT_DIR, "product_quality_risk_summary.csv"),
    index=False,
)

Table	Description
`orders`	Transaction-level data with dates, totals, channel, and payment method
`order_items`	Product-level detail for each order
`returns`	Return events including refund amounts and reasons
`return_items`	Line-level product returns tied to return IDs
`customers`	Customer demographics, loyalty tier, signup date
`product_catalog`	Product metadata including name, category, and price

View Name	Description
`monthly_signup_channel_sales_returns`	Return and sales trends by signup cohort and channel
`top_customers_by_returns`	Customers with the highest refund totals
`monthly_payment_sales_returns`	MoM performance by payment method
`return_reason_summary`	Aggregated reasons for returns
`region_summary_by_state`	Sales and return breakdown by state/region
`monthly_sales_returns_summary`	High-level monthly trends in orders, returns, and refunds
`monthly_clv_sales_returns`	CLV trends across time, including return-adjusted value
`monthly_loyalty_sales_returns`	Return behavior by loyalty tier
`customer_segment_sales_returns`	Return behavior by key customer segments
`return_rate_by_product`	Return count and percentage by product
`shipping_return_impact_summary`	Return likelihood based on shipping type
`category_monthly_sales_returns`	Sales/return trends across product categories
`monthly_channel_sales_returns`	Return trends grouped by order channel

	region	state_code	order_month	total_orders	total_sales	total_returns	total_refunds	percent_revenue_refunded
598	Unknown	_a	NaN	1	0.00	0	0.00	NaN
599	Unknown	fm	2024-07	6	23245.47	3	8010.07	34.46
600	Unknown	fm	2024-08	10	38923.01	3	3706.02	9.52
601	Unknown	fm	2024-09	10	32672.72	3	7053.79	21.59
602	Unknown	fm	2024-10	11	64688.24	4	9878.23	15.27
603	Unknown	fm	2024-11	11	55842.13	3	8914.93	15.96
604	Unknown	fm	2024-12	12	69668.19	2	6949.09	9.97
605	Unknown	fm	2025-01	6	31708.38	1	972.08	3.07
606	Unknown	fm	2025-02	11	42821.84	1	874.34	2.04
607	Unknown	fm	2025-03	10	51007.58	3	12381.77	24.27
608	Unknown	fm	2025-04	8	36091.94	3	2907.35	8.06
609	Unknown	fm	2025-05	14	40610.38	1	1220.79	3.01
610	Unknown	fm	2025-06	10	38075.27	2	12254.72	32.19
611	Unknown	fm	2025-07	5	25930.39	2	6341.66	24.46
612	Unknown	mh	2024-07	5	21009.59	2	3694.47	17.58
613	Unknown	mh	2024-08	20	105970.73	9	30413.73	28.70
614	Unknown	mh	2024-09	12	68076.97	3	5764.44	8.47
615	Unknown	mh	2024-10	19	85992.51	6	17490.03	20.34
616	Unknown	mh	2024-11	20	97181.22	5	15418.39	15.87
617	Unknown	mh	2024-12	23	85770.07	7	18592.93	21.68
618	Unknown	mh	2025-01	16	61111.36	5	10106.71	16.54
619	Unknown	mh	2025-02	14	53441.38	5	10871.88	20.34
620	Unknown	mh	2025-03	17	77491.05	6	24311.62	31.37
621	Unknown	mh	2025-04	15	55310.71	2	5823.33	10.53
622	Unknown	mh	2025-05	20	84579.56	8	28944.27	34.22
623	Unknown	mh	2025-06	14	64608.99	3	2994.95	4.64
624	Unknown	mh	2025-07	8	33000.44	4	6262.57	18.98
625	Unknown	pw	2024-07	7	37090.83	2	5707.93	15.39
626	Unknown	pw	2024-08	21	98575.06	6	14274.94	14.48
627	Unknown	pw	2024-09	19	67206.41	8	8685.62	12.92
628	Unknown	pw	2024-10	16	74622.84	4	14034.11	18.81
629	Unknown	pw	2024-11	21	89165.59	4	17808.05	19.97
630	Unknown	pw	2024-12	19	77254.06	4	16867.67	21.83
631	Unknown	pw	2025-01	19	95637.61	7	27337.04	28.58
632	Unknown	pw	2025-02	19	86149.69	9	31265.30	36.29
633	Unknown	pw	2025-03	21	79741.49	7	14873.86	18.65
634	Unknown	pw	2025-04	9	38194.35	2	8646.99	22.64
635	Unknown	pw	2025-05	19	77705.94	7	13049.09	16.79
636	Unknown	pw	2025-06	21	79692.01	7	12952.49	16.25
637	Unknown	pw	2025-07	8	37667.11	1	322.62	0.86

Shipping Speed	Total Sales	Refunds Issued	% Revenue Refunded	Order Count
Standard	$32,792,079.09	$6,844,296.08	20.87%	7,775
Two-Day	$9,922,715.29	$2,159,967.54	21.77%	2,376
Overnight	$7,464,937.81	$1,605,040.99	21.50%	1,794

	customer_id	clv_bucket	loyalty_tier	signup_channel	total_orders	total_sales	total_returns	total_refunded	avg_return_value	return_rate_pct
0	CUST-4220	high	platinum	email	27	117084.17	13	56313.68	4331.82	48.1%
1	CUST-3691	high	platinum	phone	19	105383.54	8	44595.38	5574.42	42.3%
2	CUST-4451	high	gold	website	31	138863.44	13	44237.89	3402.91	31.9%
3	CUST-4993	high	platinum	social media	24	102667.34	11	43790.91	3980.99	42.6%
4	CUST-4897	high	platinum	website	31	132980.89	11	41390.80	3762.80	31.1%
5	CUST-4053	high	platinum	social media	17	85005.68	8	40381.13	5047.64	47.5%
6	CUST-3786	high	platinum	phone	25	122933.45	11	39953.47	3632.13	32.5%
7	CUST-4552	high	gold	website	29	130651.28	11	38913.39	3537.58	29.8%
8	CUST-3847	high	gold	website	30	121577.93	10	38372.95	3837.30	31.6%
9	CUST-5206	high	platinum	social media	14	66463.12	7	37737.19	5391.03	56.8%

🧭 Project Context & Dataset Overview

📊 Sales vs. Returns Diagnostic Report and Analysis

📌 Executive Summary: Q3 2025 Sales & Returns Diagnostic

📚 Table of Contents

📁 Data Overview

📥 Setup & Load Data

📊 Overall Sales and Refund Health: Initial Performance Snapshot

🗺️ Regional Performance: Sales vs. Return Rate Analysis

🚚 Shipping Speed Impact: Evaluating Return Likelihood

🔀 Segment Performance: Loyalty Tiers and CLV Buckets

📦 Channel & Payment Method Diagnostic

🚨 High-Risk & Emerging Reseller Behavior

📈 Return Trends by Category and Reason

🚩 Product Return Risk: Quality Flag System Deep Dive

📤 Dataframe Export

✅ Closing Note

	product_id	product_name	return_reason	product_category	order_count	return_count	total_refunded	avg_refund	return_rate_percent
1572	product_id	product_name	reason	category	1	1	0.00	0.00	100.00
4813	697	cozy table	found a better price	home	37	7	4750.25	678.61	18.92
0	652	classic anthology	changed mind	books	36	6	4042.09	673.68	16.67
6384	1221	colorful blocks	changed mind	toys	50	8	2403.96	300.50	16.00
1	1095	classic memoir	changed mind	books	51	8	751.59	93.95	15.69
1573	1007	durable jacket	changed mind	clothing	58	9	4692.87	521.43	15.52
1574	197	stylish sweater	changed mind	clothing	59	9	9715.86	1079.54	15.25
3181	149	portable monitor	changed mind	electronics	53	8	5058.90	632.36	15.09
1575	526	classic sweater	changed mind	clothing	67	10	2332.98	233.30	14.93
1576	364	classic sweater	changed mind	clothing	47	7	4609.93	658.56	14.89
1577	565	durable shirt	wrong item	clothing	48	7	1363.50	194.79	14.58
1578	1042	classic jeans	changed mind	clothing	48	7	4576.20	653.74	14.58
6385	964	fun blocks	found a better price	toys	49	7	3946.40	563.77	14.29
1579	79	durable sweater	changed mind	clothing	42	6	5472.06	912.01	14.29
4814	116	rustic chair	changed mind	home	42	6	5418.90	903.15	14.29

	product_id	product_name	return_reason	product_category	order_count	return_count	total_refunded	avg_refund	return_rate_percent	quality_flag
3182	583	portable headphones	defective	electronics	50	7	5110.56	730.08	14.00	True
4815	350	cozy lamp	product did not match description	home	43	6	532.27	88.71	13.95	True
4816	703	modern chair	defective	home	44	6	2024.76	337.46	13.64	True
3187	729	smart monitor	defective	electronics	61	8	1694.23	211.78	13.11	True
3188	866	wireless camera	defective	electronics	46	6	3125.88	520.98	13.04	True
3189	1251	smart camera	defective	electronics	54	7	5396.27	770.90	12.96	True
4818	849	elegant chair	product did not match description	home	47	6	331.50	55.25	12.77	True
3190	923	wireless monitor	defective	electronics	47	6	246.40	41.07	12.77	True
4	886	illustrated guide	arrived damaged	books	47	6	3582.00	597.00	12.77	True
3191	692	portable camera	arrived damaged	electronics	48	6	936.60	156.10	12.50	True
6387	1198	interactive puzzle	defective	toys	48	6	8092.56	1348.76	12.50	True
1588	378	classic sweater	defective	clothing	49	6	1753.50	292.25	12.24	True
3193	570	wireless speaker	defective	electronics	49	6	3850.44	641.74	12.24	True
3192	342	smart camera	product did not match description	electronics	49	6	4200.02	700.00	12.24	True
3195	1208	portable camera	defective	electronics	58	7	4588.38	655.48	12.07	True

🧭 Project Context & Dataset Overview

📊 Sales vs. Returns Diagnostic Report and Analysis

📌 Executive Summary: Q3 2025 Sales & Returns Diagnostic

📚 Table of Contents

📁 Data Overview

📥 Setup & Load Data

📊 Overall Sales and Refund Health: Initial Performance Snapshot

🗺️ Regional Performance: Sales vs. Return Rate Analysis

🚚 Shipping Speed Impact: Evaluating Return Likelihood

🔀 Segment Performance: Loyalty Tiers and CLV Buckets

🧬 Signup Cohort Analysis: Growth, Value, and Return Behavior

📦 Channel & Payment Method Diagnostic

🚨 High-Risk & Emerging Reseller Behavior

📈 Return Trends by Category and Reason

🚩 Product Return Risk: Quality Flag System Deep Dive

📤 Dataframe Export

✅ Closing Note