import kagglehub
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

pio.renderers.default = 'notebook_connected' # Ensure plots are displayed in the notebook

from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, 'anirudhchauhan/retail-store-inventory-forecasting-dataset', 'retail_store_inventory.csv')

print('Basic Information:')
print(df.info())

print(f'Descriptive Statistics:\n{df.describe()}')

print(f'Duplicate Rows: {df.duplicated().sum()}')
print(f'Missing Values: {df.isnull().sum().sum()}')

print("\nUnique Values per Column:")
for column in df.columns:
    print(f"{column}: {df[column].nunique()} unique values")

Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73100 entries, 0 to 73099
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                73100 non-null  object 
 1   Store ID            73100 non-null  object 
 2   Product ID          73100 non-null  object 
 3   Category            73100 non-null  object 
 4   Region              73100 non-null  object 
 5   Inventory Level     73100 non-null  int64  
 6   Units Sold          73100 non-null  int64  
 7   Units Ordered       73100 non-null  int64  
 8   Demand Forecast     73100 non-null  float64
 9   Price               73100 non-null  float64
 10  Discount            73100 non-null  int64  
 11  Weather Condition   73100 non-null  object 
 12  Holiday/Promotion   73100 non-null  int64  
 13  Competitor Pricing  73100 non-null  float64
 14  Seasonality         73100 non-null  object 
dtypes: float64(3), int64(5), object(7)
memory usage: 8.4+ MB
None
Descriptive Statistics:
       Inventory Level    Units Sold  Units Ordered  Demand Forecast  \
count     73100.000000  73100.000000   73100.000000     73100.000000   
mean        274.469877    136.464870     110.004473       141.494720   
std         129.949514    108.919406      52.277448       109.254076   
min          50.000000      0.000000      20.000000        -9.990000   
25%         162.000000     49.000000      65.000000        53.670000   
50%         273.000000    107.000000     110.000000       113.015000   
75%         387.000000    203.000000     155.000000       208.052500   
max         500.000000    499.000000     200.000000       518.550000   

              Price      Discount  Holiday/Promotion  Competitor Pricing  
count  73100.000000  73100.000000       73100.000000        73100.000000  
mean      55.135108     10.009508           0.497305           55.146077  
std       26.021945      7.083746           0.499996           26.191408  
min       10.000000      0.000000           0.000000            5.030000  
25%       32.650000      5.000000           0.000000           32.680000  
50%       55.050000     10.000000           0.000000           55.010000  
75%       77.860000     15.000000           1.000000           77.820000  
max      100.000000     20.000000           1.000000          104.940000  
Duplicate Rows: 0
Missing Values: 0

Unique Values per Column:
Date: 731 unique values
Store ID: 5 unique values
Product ID: 20 unique values
Category: 5 unique values
Region: 4 unique values
Inventory Level: 451 unique values
Units Sold: 498 unique values
Units Ordered: 181 unique values
Demand Forecast: 31608 unique values
Price: 8999 unique values
Discount: 5 unique values
Weather Condition: 4 unique values
Holiday/Promotion: 2 unique values
Competitor Pricing: 9751 unique values
Seasonality: 4 unique values

df['Date'] = pd.to_datetime(df['Date'])
df['Revenue'] = (df['Price'] - (df['Price'] * df['Discount'] / 100)) * df['Units Sold']

print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73100 entries, 0 to 73099
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                73100 non-null  datetime64[ns]
 1   Store ID            73100 non-null  object        
 2   Product ID          73100 non-null  object        
 3   Category            73100 non-null  object        
 4   Region              73100 non-null  object        
 5   Inventory Level     73100 non-null  int64         
 6   Units Sold          73100 non-null  int64         
 7   Units Ordered       73100 non-null  int64         
 8   Demand Forecast     73100 non-null  float64       
 9   Price               73100 non-null  float64       
 10  Discount            73100 non-null  int64         
 11  Weather Condition   73100 non-null  object        
 12  Holiday/Promotion   73100 non-null  int64         
 13  Competitor Pricing  73100 non-null  float64       
 14  Seasonality         73100 non-null  object        
 15  Revenue             73100 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(5), object(6)
memory usage: 8.9+ MB
None
        Date Store ID Product ID     Category Region  Inventory Level  \
0 2022-01-01     S001      P0001    Groceries  North              231   
1 2022-01-01     S001      P0002         Toys  South              204   
2 2022-01-01     S001      P0003         Toys   West              102   
3 2022-01-01     S001      P0004         Toys  North              469   
4 2022-01-01     S001      P0005  Electronics   East              166   

   Units Sold  Units Ordered  Demand Forecast  Price  Discount  \
0         127             55           135.47  33.50        20   
1         150             66           144.04  63.01        20   
2          65             51            74.02  27.99        10   
3          61            164            62.18  32.72        10   
4          14            135             9.26  73.64         0   

  Weather Condition  Holiday/Promotion  Competitor Pricing Seasonality  \
0             Rainy                  0               29.69      Autumn   
1             Sunny                  0               66.16      Autumn   
2             Sunny                  1               31.32      Summer   
3            Cloudy                  1               34.74      Autumn   
4             Sunny                  0               68.95      Summer   

    Revenue  
0  3403.600  
1  7561.200  
2  1637.415  
3  1796.328  
4  1030.960

# Group data by date and calculate total demand (assuming a 'Demand' column exists)
demand_trends = df.groupby('Date')['Demand Forecast'].sum()

# Plot the demand trends 
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=demand_trends.index, 
    y=demand_trends.values, 
    mode='lines', 
    name='Daily Demand', 
    line=dict(color='#324fff'),
    hovertemplate=(
        "Date: %{x}<br>"
        "Total Demand: %{y}<br>"
        "Min Demand: %{customdata[0]:.2f}<br>"
        "Mean Demand: %{customdata[1]:.2f}<br>"
        "Median Demand: %{customdata[2]:.2f}<br>"
        "Max Demand: %{customdata[3]:.2f}<extra></extra>"
    ),
    customdata=np.stack((
        [demand_trends.min()] * len(demand_trends),
        [demand_trends.mean()] * len(demand_trends),
        [demand_trends.median()] * len(demand_trends),
        [demand_trends.max()] * len(demand_trends)
    ), axis=-1)
))
fig.update_layout(
    title='Demand Trends Over Time',
    xaxis_title='Date',
    yaxis_title='Total Demand',
    template='plotly_white',
)
fig.show()

# Analyze demand by week & month
df['Week'] = df['Date'].dt.isocalendar().week
df['Month'] = df['Date'].dt.month

weekly_demand = df.groupby('Week')['Demand Forecast'].sum()
monthly_demand = df.groupby('Month')['Demand Forecast'].sum()

# Plot weekly demand 
fig_weekly = px.bar(x=weekly_demand.index, y=weekly_demand.values, labels={'x': 'Week', 'y': 'Total Demand'}, title='Weekly Demand')
fig_weekly.update_traces(marker=dict(color=weekly_demand.values, colorscale='viridis', colorbar=dict(title='Total Demand')))
fig_weekly.update_layout(template='plotly_white')
fig_weekly.show()

# Plot monthly demand
fig_monthly = px.bar(x=monthly_demand.index, y=monthly_demand.values, labels={'x': 'Month', 'y': 'Total Demand'}, title='Monthly Demand')
fig_monthly.update_traces(marker=dict(color=monthly_demand.values, colorscale='viridis', colorbar=dict(title='Total Demand')))
fig_monthly.update_layout(template='plotly_white')
fig_monthly.show()

## Competitor Pricing Impact

# Check if 'Competitor Pricing' column exists
if 'Competitor Pricing' in df.columns:
    # Correlation Analysis
    print("\nCorrelation between Competitor Price and Demand:")
    print(df[['Competitor Pricing', 'Demand Forecast']].corr())

    # Scatter Plot
    fig_competitor = px.scatter(
        df, 
        x='Competitor Pricing', 
        y='Demand Forecast', 
        title='Demand vs Competitor Price',
        labels={'Competitor Pricing': 'Competitor Pricing', 'Demand Forecast': 'Demand Forecast'},
        opacity=0.6
    )
    fig_competitor.update_traces(marker=dict(size=8, color=df['Demand Forecast'], colorscale='Viridis', colorbar=dict(title='Demand Forecast')))
    fig_competitor.update_layout(template='plotly_white')
    fig_competitor.show()

    # Price Difference Analysis
    df['Price Difference'] = df['Price'] - df['Competitor Pricing']
    print("\nCorrelation between Price Difference and Demand:")
    print(df[['Price Difference', 'Demand Forecast']].corr())

    # Scatter Plot for Price Difference
    fig_price_diff = px.scatter(
        df, 
        x='Price Difference', 
        y='Demand Forecast', 
        title='Demand vs Price Difference',
        labels={'Price Difference': 'Price Difference (Your Price - Competitor Price)', 'Demand Forecast': 'Demand Forecast'},
        opacity=0.6
    )
    fig_price_diff.update_traces(marker=dict(size=8, color=df['Demand Forecast'], colorscale='Viridis', colorbar=dict(title='Demand Forecast')))
    fig_price_diff.update_layout(template='plotly_white')
    fig_price_diff.update_traces(
        hovertemplate='Price Difference=%{x}<br>Demand Forecast=%{y}<br>Mean Price=%{customdata[0]:.2f}<br>Mean Competitor Price=%{customdata[1]:.2f}<extra></extra>',
        customdata=np.stack((df['Price'], df['Competitor Pricing']), axis=-1)
    )
    fig_price_diff.show()

else:
    print("The dataset does not contain a 'Competitor Pricing' column.")

Correlation between Competitor Price and Demand:
                    Competitor Pricing  Demand Forecast
Competitor Pricing            1.000000         0.000844
Demand Forecast               0.000844         1.000000

Correlation between Price Difference and Demand:
                  Price Difference  Demand Forecast
Price Difference           1.00000         -0.00157
Demand Forecast           -0.00157          1.00000

## Discount Strategies

# Check if 'Discount' column exists
if 'Discount' in df.columns:
    
    # Correlation Analysis
    print("\nCorrelation between Discount and Demand Forecast:")
    print(df[['Discount', 'Demand Forecast']].corr())
    # Box Plot 
    fig_discount_box = px.box(
        df, 
        x='Discount', 
        y='Demand Forecast', 
        points='all', 
        title='Demand Forecast Distribution by Discount Level',
        labels={'Discount': 'Discount (%)', 'Demand Forecast': 'Demand Forecast'}
    )
    fig_discount_box.update_traces(marker=dict(size=5, opacity=0.7))
    fig_discount_box.update_layout(template='plotly_white')
    fig_discount_box.show()

    # Revenue Simulation
    df['Revenue'] = (df['Price'] - (df['Price'] * df['Discount'] / 100)) * df['Demand Forecast']
    avg_revenue_by_discount = df.groupby('Discount')['Revenue'].mean()

    print("Average Revenue by Discount Level:")
    print(avg_revenue_by_discount)

    # Bar Plot 
    fig_revenue_bar = px.bar(
        avg_revenue_by_discount, 
        x=avg_revenue_by_discount.index, 
        y=avg_revenue_by_discount.values, 
        title='Average Revenue by Discount Level',
        labels={'x': 'Discount (%)', 'y': 'Average Revenue'}
    )
    fig_revenue_bar.update_traces(marker=dict(color=avg_revenue_by_discount.values, colorscale='Viridis', colorbar=dict(title='Average Revenue')))
    fig_revenue_bar.update_layout(template='plotly_white', xaxis_title='Discount (%)', yaxis_title='Average Revenue')
    fig_revenue_bar.update_traces(texttemplate='$%{y:,.2f}', textposition='outside')
    fig_revenue_bar.show()

else:
    print("The dataset does not contain a 'Discount' column.")

Correlation between Discount and Demand Forecast:
                 Discount  Demand Forecast
Discount         1.000000         0.002338
Demand Forecast  0.002338         1.000000

Average Revenue by Discount Level:
Discount
0     7737.608111
5     7442.234830
10    6984.070860
15    6645.119698
20    6293.065972
Name: Revenue, dtype: float64

# Check if required columns exist
required_columns = ['Price', 'Competitor Pricing', 'Discount', 'Demand Forecast']
if all(col in df.columns for col in required_columns):
    # Prepare the data
    X = df[['Price', 'Competitor Pricing', 'Discount']]
    y = df['Demand Forecast']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")

    # Display coefficients
    coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
    print("\nModel Coefficients:")
    print(coefficients)

else:
    print("The dataset does not contain all required columns: 'Price', 'Competitor Pricing', 'Discount', 'Demand Forecast'.")

Mean Squared Error: 11923.807856940673
R-squared: -0.000288284820934015

Model Coefficients:
              Feature  Coefficient
0               Price     0.081647
1  Competitor Pricing    -0.084639
2            Discount     0.055434

## Simulate Revenue Maximization

# Check if required columns exist
required_columns = ['Price', 'Competitor Pricing', 'Discount', 'Demand Forecast']
if all(col in df.columns for col in required_columns):
    # Define a range of price adjustments
    price_adjustments = [-0.2, -0.1, 0, 0.1, 0.2]  # Adjustments in percentage (-20%, -10%, 0%, +10%, +20%)
    simulation_results = []

    for adjustment in price_adjustments:
        # Adjust prices
        df['Adjusted Price'] = df['Price'] * (1 + adjustment)

        # Simulate revenue with adjusted prices
        df['Simulated Revenue'] = (df['Adjusted Price'] - (df['Adjusted Price'] * df['Discount'] / 100)) * df['Demand Forecast']

        # Calculate total revenue
        total_revenue = df['Simulated Revenue'].sum()
        simulation_results.append({'Price Adjustment (%)': adjustment * 100, 'Total Revenue': total_revenue})

    # Convert results to a DataFrame
    simulation_df = pd.DataFrame(simulation_results)

    # Plot the simulation results
    fig_simulation = px.bar(
        simulation_df,
        x='Price Adjustment (%)',
        y='Total Revenue',
        title='Revenue Simulation for Different Price Adjustments',
        labels={'Price Adjustment (%)': 'Price Adjustment (%)', 'Total Revenue': 'Total Revenue'},
        text='Total Revenue',
        color='Price Adjustment (%)',
        color_continuous_scale=px.colors.sequential.Viridis
    )
    fig_simulation.update_traces(texttemplate='$%{text:.2s}', textposition='outside')
    fig_simulation.update_layout(template='plotly_white', xaxis_title='Price Adjustment (%)', yaxis_title='Total Revenue')
    fig_simulation.update_yaxes(tick0=0, dtick=1e8, tickprefix="$")
    fig_simulation.update_traces(
        hovertemplate='Price Adjustment (%)=%{x}<br>Total Revenue=$%{y:,.2f}<extra></extra>'
    )
    fig_simulation.show()

    print("Simulation Results:")
    simulation_df['Total Revenue'] = simulation_df['Total Revenue'].apply(lambda x: f"${x:,.2f}")
    print(simulation_df)

else:
    print("The dataset does not contain all required columns: 'Price', 'Competitor Pricing', 'Discount', 'Demand Forecast'.")

Simulation Results:
   Price Adjustment (%)    Total Revenue
0                 -20.0  $410,515,243.84
1                 -10.0  $461,829,649.32
2                   0.0  $513,144,054.80
3                  10.0  $564,458,460.29
4                  20.0  $615,772,865.77

Table of Contents

Dynamic Pricing Strategy¶

Importing the Required Libraries¶

Loading the dataset from Kaggle¶

Exploratory Data Analysis¶

Insights from the Output¶

Data Wrangling¶

Analyzing Demand Trends¶

Insights from Data Trends¶

Competitor Pricing Impact¶

Discount Strategies¶

Developing the Pricing Model¶

Simulate Revenue Maximization¶

Conclusion¶