# 📁 Load Configuration and Set Execution Context

from analyst_toolkit.m00_utils.config_loader import load_config

# Master config location - point this at a different YAML file if required.
config_path = "config/run_toolkit_config.yaml"

# Parse the master config once; later cells read run-level settings from it.
config = load_config(config_path)

# Run-level settings, with fallbacks used when the keys are absent.
run_id = config.get("run_id", "default_run")
notebook_mode = config.get("notebook", True)

# Echo the resolved settings so the active run is visible in the output.
print(f"🔧 Config loaded | Run ID: {run_id} | Notebook Mode: {notebook_mode}")

🔧 Config loaded | Run ID: CLI_2_QA | Notebook Mode: True

# 📥 Load Raw Data from CSV

from analyst_toolkit.m00_utils.load_data import load_csv

# The entry path comes from the global config; the literal below is only the
# fallback used when "pipeline_entry_path" is not set (edit it to override).
input_path = config.get("pipeline_entry_path", "data/raw/synthetic_penguins_v3.5.csv")
print(f"📂 Loading data from: {input_path}")

# Read the CSV; the result is the raw input passed to the M01 diagnostics cell.
df_raw = load_csv(input_path)

📂 Loading data from: data/raw/synthetic_penguins_v3.5.csv

# 📊 M01: Data Diagnostics – Profile Structure & Shape

from analyst_toolkit.m00_utils.config_loader import load_config
from analyst_toolkit.m01_diagnostics.run_diag_pipeline import run_diag_pipeline

# --- Diagnostics config (module-specific YAML) ---
diag_config_full = load_config("config/diag_config_template.yaml")

# --- Run M01 diagnostics ---
# Input is df_raw from the data-loading cell; run_id and notebook_mode are the
# global values read from the master config.
df_profiled = run_diag_pipeline(
    config=diag_config_full,  # the full config object, not a sub-section
    df=df_raw,
    notebook=notebook_mode,
    run_id=run_id,
)

Accordion(children=(VBox(children=(HTML(value="<h3 style='margin-top:10px'>Visual Profile</h3>"), HBox(childre…

# 🛡️ M02: Schema & Content Validation – First Audit Pass

from analyst_toolkit.m00_utils.config_loader import load_config
from analyst_toolkit.m02_validation.run_validation_pipeline import run_validation_pipeline

# --- Validation config (module-specific YAML) ---
val_config_full = load_config("config/validation_config_template.yaml")

# --- Run M02 validation (first audit pass) ---
# Input is df_profiled, the output of the M01 diagnostics cell.
df_validated = run_validation_pipeline(
    config=val_config_full,
    df=df_profiled,
    notebook=notebook_mode,
    run_id=run_id,
)

# 🧹 M03: Data Normalization – Standardizing Key Fields

import logging
from analyst_toolkit.m00_utils.config_loader import load_config
from analyst_toolkit.m03_normalization.run_normalization_pipeline import run_normalization_pipeline

# --- Normalization config (module-specific YAML) ---
norm_config_full = load_config("config/normalization_config_template.yaml")

# --- Run M03 normalization ---
# Input is df_validated from the M02 cell; run_id and notebook_mode are the
# global values read from the master config.
df_normalized = run_normalization_pipeline(
    config=norm_config_full,
    df=df_validated,
    notebook=notebook_mode,
    run_id=run_id,
)

# 🛡️ M02: Certification (Strict Validation Gatekeeper)

import logging
from analyst_toolkit.m00_utils.config_loader import load_config
from analyst_toolkit.m02_validation.run_validation_pipeline import run_validation_pipeline

# --- Certification config (separate YAML from the first validation pass) ---
cert_config_full = load_config("config/certification_config_template.yaml")

# --- Run the certification pass ---
# The gate re-uses the M02 validation pipeline, this time on the normalized
# data and driven by the certification config loaded above.
logging.info("🚀 Starting Certification Gate (re-using M02)")

df_certified = run_validation_pipeline(
    config=cert_config_full,
    df=df_normalized,
    notebook=notebook_mode,
    run_id=run_id,
)

# ♻️ M04: Deduplication and Duplicates Handling

from analyst_toolkit.m00_utils.config_loader import load_config
from analyst_toolkit.m04_duplicates.run_dupes_pipeline import run_duplicates_pipeline
import logging

# --- Duplicates config (module-specific YAML) ---
dupes_config_full = load_config("config/dups_config_template.yaml")

# --- Run M04 duplicates handling ---
# Input is df_certified, the output of the certification cell.
df_deduped = run_duplicates_pipeline(
    config=dupes_config_full,
    df=df_certified,
    notebook=notebook_mode,
    run_id=run_id,
)

Accordion(children=(VBox(children=(HTML(value="<h3 style='margin-top:10px'>Visual Summary</h3>"), HBox(childre…

# 📏 M05: Detect Outliers and Plot Visuals

import logging
from analyst_toolkit.m00_utils.config_loader import load_config
from analyst_toolkit.m05_detect_outliers.run_detection_pipeline import run_outlier_detection_pipeline
from IPython.display import display

# --- Load module-specific config ---
outlier_config_full = load_config("config/outlier_config_template.yaml")

# --- Run M05 outlier detection ---
# Input is `df_deduped`, the output of the M04 duplicates cell. If that cell has
# not been run, stop here with an explicit message: silently skipping detection
# would leave `df_outliers_flagged` and `detection_results` undefined, and the
# M06 handling cell would then fail with a less obvious NameError.
if 'df_deduped' not in locals():
    raise NameError(
        "df_deduped is not defined - run the M04 duplicates cell before "
        "M05 outlier detection."
    )

# The pipeline's result is unpacked into two values: the flagged DataFrame and
# the detection results that the M06 handling cell takes as input.
df_outliers_flagged, detection_results = run_outlier_detection_pipeline(
    config=outlier_config_full,
    df=df_deduped,
    notebook=notebook_mode,
    run_id=run_id,
)

Accordion(children=(VBox(children=(HTML(value="<h3 style='margin-top:10px'>Outlier Visualizations</h3>"), HBox…

# 🧼 M06: Handle Outliers

import logging
from analyst_toolkit.m00_utils.config_loader import load_config
from analyst_toolkit.m06_outlier_handling.run_handling_pipeline import run_outlier_handling_pipeline

# --- Outlier-handling config (module-specific YAML) ---
handling_config_full = load_config("config/handling_config_template.yaml")

# --- Run M06 outlier handling ---
# detection_results is handed over as a single dictionary, exactly as returned
# by the M05 cell - do not unpack it into separate arguments.
df_handled = run_outlier_handling_pipeline(
    config=handling_config_full,
    df=df_outliers_flagged,
    detection_results=detection_results,  # the whole dictionary
    notebook=notebook_mode,
    run_id=run_id,
)

# 🔧 M07: Impute Data and Plot Summary Visuals

import logging
from analyst_toolkit.m00_utils.config_loader import load_config
from analyst_toolkit.m07_imputation.run_imputation_pipeline import run_imputation_pipeline

# --- Imputation config (module-specific YAML) ---
imputation_config_full = load_config("config/imputation_config_template.yaml")

# --- Run M07 imputation ---
# Works on the in-memory df_handled from the M06 cell.
df_imputed = run_imputation_pipeline(
    config=imputation_config_full,
    notebook=notebook_mode,
    df=df_handled,  # existing DataFrame from the previous step
    run_id=run_id,
)

Accordion(children=(VBox(children=(HTML(value="<h3 style='margin-top:10px'>Imputation Visualizations</h3>"), H…

# 🎬 M10: Final Auditing and Certification

from analyst_toolkit.m10_final_audit.final_audit_pipeline import run_final_audit_pipeline
from analyst_toolkit.m00_utils.config_loader import load_config

# --- Final-audit config (module-specific YAML) ---
final_audit_config_full = load_config("config/final_audit_config_template.yaml")

# --- Run M10 final audit ---
# The complete config dictionary is passed (not a sub-section) because the
# audit may also run validation using rules held in a separate block of it.
df_final_clean = run_final_audit_pipeline(
    config=final_audit_config_full,
    df=df_imputed,  # existing DataFrame from the M07 imputation cell
    notebook=notebook_mode,
    run_id=run_id,
)

Column	Unique Values
tag_id	2678
capture_date	1917
date_egg	1656
colony_id	19

Column	Dtype	Unique Values	Audit Remarks	Missing Count	Missing %
tag_id	object	2678	✅ OK	2242	40.46
species	object	5	✅ OK	166	3.00
bill length (mm)	float64	1984	✅ OK	429	7.74
bill depth (mm)	float64	862	✅ OK	417	7.53
flipper_length_mm	float64	1466	✅ OK	451	8.14
body_mass_g	float64	3328	✅ OK	406	7.33
age_group	object	7	✅ OK	121	2.18
sex	object	6	✅ OK	2739	49.43
colony_id	object	19	✅ OK	405	7.31
island	object	11	✅ OK	584	10.54
capture_date	object	1917	✅ OK	534	9.64
health_status	object	9	✅ OK	554	10.00
study_name	object	12	✅ OK	563	10.16
clutch_completion	object	2	✅ OK	463	8.36
date_egg	object	1656	✅ OK	836	15.09

Metric	count	mean	std	min	25%	50%	75%	max	skew	kurtosis
bill length (mm)	5112.0	45.166682	5.666410	30.63	40.51	45.950	49.360	62.64	-0.145952	-0.606829
bill depth (mm)	5124.0	17.305377	2.231495	12.37	15.49	17.485	19.030	23.01	-0.111456	-0.897492
flipper_length_mm	5090.0	202.237800	14.342621	162.79	191.10	199.315	214.100	252.40	0.329099	-0.616376
body_mass_g	5135.0	3853.645265	898.232986	2376.56	3219.50	3742.000	4376.515	7378.33	0.616778	0.086446

tag_id	species	bill length (mm)	bill depth (mm)	flipper_length_mm	body_mass_g	age_group	sex	colony_id	island	capture_date	health_status	study_name	clutch_completion	date_egg
NaN	Gentoo	48.99	14.11	220.9	5890.0	Adult	Male	Torgersen North	Torgersen	2023-11-17	NaN	PAPRI2023	Yes	2023-11-09
NaN	Gentoo	48.99	14.11	220.9	5890.0	Adult	Male	Torgersen North	Torgersen	2023-11-17	NaN	PAPRI2023	Yes	2023-11-09

tag_id	species	bill length (mm)	bill depth (mm)	flipper_length_mm	body_mass_g	age_group	sex	colony_id	island	capture_date	health_status	study_name	clutch_completion	date_egg
NaN	Gentoo	48.99	14.11	220.9	5890.0	Adult	Male	Torgersen North	Torgersen	2023-11-17	NaN	PAPRI2023	Yes	2023-11-09
NaN	Gentoo	48.99	14.11	220.9	5890.0	Adult	Male	Torgersen North	Torgersen	2023-11-17	NaN	PAPRI2023	Yes	2023-11-09
ADE-0001	Adelie	39.55	19.92	186.2	2500.0	Chick	Male	Biscoe West	Biscoe	2024-13-03	Underweight	PAPRI2022	Yes	2022-07-20
NaN	Gentoo	48.23	13.00	NaN	4536.0	Adult	Female	Biscoe West	NaN	2024-04-14	Healthy	NaN	Yes	2024-04-12
GEN-0001	Gentoo	46.22	13.91	212.8	2500.0	Juvenile	Female	Dream South	Dream	NaN	Underweight	PAPRI2020	Yes	2020-04-14

Validation Rule	Description	Status
Schema Conformity	Verify column names match the expected schema.	⚠️ Fail (2 issues)
Dtype Enforcement	Verify column data types match expectations.	⚠️ Fail (1 issues)
Categorical Values	Verify values in categorical columns are within an allowed set.	⚠️ Fail (7 issues)
Numeric Ranges	Verify values in numeric columns are within a defined range.	✅ Pass

Issue Type	Columns
Missing	bill_length_mm, bill depth_mm
Unexpected	bill depth (mm), bill length (mm)

Invalid Value	Count
cormorant NW	45
invalid_colony	36
Torgersen	35
Cormorant	34
biscoe 2	34
torgersen SE	31
TORGERSEN 4	30
short point	28
/Shortcut	26
Biscoe	25
dream island	24
Unknown	24
Dream Island	22
dream	19

Invalid Value	Count
PAPR12021	60
papri2024	58
STUDY_2022	57
PP2020	48
PAPR2023	46
PAPRI20X9	37

Column	Original	Corrected	Score
species	Gentto	Gentoo	83
species	adeleie	Adelie	92
island	bisco	Biscoe	91
island	short cut	Shortcut	94
island	dreamland	Dream	90
island	cormor	Cormorant	90
island	torg	Torgersen	90

Value	Count
Torgersen	1405
Dream	1184
Biscoe	1084
Cormorant	715
NaN	584
Shortcut	510
UNKNOWN	59

Value	Original Count	Normalized Count
Torgersen	1344	1405
Dream	1138	1184
Biscoe	1029	1084
Cormorant	668	715
NaN	584	584
Shortcut	440	510
short cut	70	0
torg	61	0
unknown	59	0
bisco	55	0
cormor	47	0
dreamland	46	0
UNKNOWN	0	59

Value	Original Count	Normalized Count
Chinstrap	1776	1776
Gentoo	1670	1815
Adelie	1636	1784
NaN	166	166
adeleie	148	0
Gentto	145	0

Value	Count
Healthy	2194
Underweight	1411
Overweight	733
NaN	554
Critical	323
Sick	296
UNKNOWN	30

Value	Count
Torgersen North	1490
Dream South	1216
Biscoe West	1092
Cormorant East	767
Shortcut Point	511
NaN	405
UNKNOWN	60

Invalid Value	Count
juvenille	58
unk	48
ADLT	47
chik	29

Invalid Value	Count
critcal ill	36
Overwight	34
under weight	33
ok	30

Column	Operation
clutch_completion	standardize_text
sex	standardize_text

Value	Count
NaN	2739
MALE	1369
FEMALE	1310
UNKNOWN	123

Value	Original Count	Normalized Count
Torgersen North	1394	1490
Dream South	1151	1216
Biscoe West	1033	1092
Cormorant East	688	767
Shortcut Point	457	511
NaN	405	405
cormorant NW	45	0
invalid_colony	36	0
Torgersen	35	0
Cormorant	34	0
biscoe 2	34	0
torgersen SE	31	0
TORGERSEN 4	30	0
short point	28	0
/Shortcut	26	0
Biscoe	25	0
Unknown	24	0
dream island	24	0
Dream Island	22	0
dream	19	0

Value	Original Count	Normalized Count
Adult	3775	3822
Juvenile	1015	1073
Chick	448	477
NaN	121	121
juvenille	58	0
unk	48	0
ADLT	47	0
chik	29	0
UNKNOWN	0	48

Value	Count
PAPRI2020	1122
PAPRI2021	1024
PAPRI2022	916
PAPRI2023	824
PAPRI2024	803
NaN	563
PAPRI2019	252
UNKNOWN	37

Value	Original Count	Normalized Count
PAPRI2020	1074	1122
PAPRI2021	964	1024
PAPRI2022	859	916
PAPRI2023	778	824
PAPRI2024	745	803
NaN	563	563
PAPRI2019	252	252
PAPR12021	60	0
papri2024	58	0
STUDY_2022	57	0
PP2020	48	0
PAPR2023	46	0
PAPRI20X9	37	0
UNKNOWN	0	37

Value	Count
NaT	915
2023-01-18	10
2024-05-09	10
2024-02-01	9
2023-06-12	8
2020-12-25	8
2022-11-15	8
2023-06-10	8
2023-03-22	8
2024-01-01	8
2022-08-04	8
2022-12-03	8
2024-06-19	8
2023-09-27	7
2022-09-28	7
2022-09-27	7
2023-10-22	7
2024-04-25	7
2023-07-25	7
2023-08-24	7

Value	Original Count	Normalized Count
NaN	534	915
9999-99-99	39	0
error	33	0
not-a-date	30	0
2023-01-18	10	10
2024-05-09	10	10
2024-02-01	9	9
2020-12-25	8	8
2022-08-04	8	8
2022-11-15	8	8
2022-12-03	8	8
2023-03-22	8	8
2023-06-10	8	8
2023-06-12	8	8
2024-01-01	8	8
2024-06-19	8	8
2020-07-02	7	7
2021-01-21	7	7
2022-01-09	7	7
2022-09-27	7	7

Value	Count
NaT	836
2019-12-11	13
2019-12-27	12
2020-10-11	11
2020-07-20	11
2019-12-17	11
2019-11-25	11
2020-06-25	11
2021-04-03	10
2021-04-16	10
2023-10-08	10
2021-07-05	9
2022-10-26	9
2021-01-06	9
2022-07-13	9
2022-02-07	9
2020-01-22	9
2021-08-30	9
2020-09-20	9
2020-01-17	9

column	method	outlier_count	lower_bound	upper_bound	outlier_examples
bill_length_mm	iqr	1	27.235000	62.635000	[62.64]
body_mass_g	zscore	18	709.829815	6997.460715	[7000.0, 7000.0, 7000.0, 7000.0, 7000.0]

tag_id	species	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	age_group	sex	colony_id	island	capture_date	health_status	study_name	clutch_completion	date_egg	is_duplicate
NaN	Gentoo	NaN	14.41	221.90	7000.00	Adult	NaN	Torgersen North	Torgersen	2019-10-31	Healthy	PAPRI2019	NaN	NaT	False
NaN	NaN	47.68	17.62	NaN	7000.00	Adult	NaN	Torgersen North	Torgersen	2021-08-17	Healthy	PAPRI2021	NaN	2021-08-14	False
GEN-0041	Gentoo	45.63	14.13	213.20	7000.00	Juvenile	FEMALE	Dream South	Dream	2021-12-02	Healthy	PAPRI2021	NaN	2021-11-23	False
NaN	Gentoo	46.39	13.84	206.30	7000.00	Adult	NaN	Cormorant East	Cormorant	2022-10-26	Healthy	PAPRI2022	NaN	2022-10-12	False
ADE-0182	Adelie	38.46	17.16	185.10	7000.00	Adult	NaN	Dream South	Dream	2024-02-03	Overweight	PAPRI2024	yes	2024-01-31	False
NaN	Gentoo	49.36	13.00	224.10	7000.00	Adult	NaN	Torgersen North	Torgersen	NaT	Healthy	NaN	no	NaT	True
NaN	Gentoo	40.59	14.37	230.00	7000.00	Adult	MALE	NaN	Biscoe	NaT	Healthy	PAPRI2021	yes	2021-03-25	True
GEN-0301	Gentoo	44.56	16.48	212.70	7000.00	Adult	MALE	Biscoe West	Biscoe	2022-12-12	Healthy	PAPRI2022	no	NaT	False
NaN	Gentoo	45.16	15.57	218.40	7000.00	Adult	FEMALE	NaN	Cormorant	2021-07-30	Healthy	PAPRI2021	yes	2021-07-17	False
GEN-0681	Gentoo	44.73	13.94	217.80	7000.00	Adult	NaN	Torgersen North	Torgersen	NaT	Healthy	PAPRI2022	yes	2022-11-07	True
GEN-0706	Gentoo	45.74	14.02	217.80	7000.00	Adult	NaN	Dream South	Dream	2024-02-28	Healthy	PAPRI2024	yes	2024-02-21	False
GEN-0743	Gentoo	49.05	14.49	213.20	7000.00	Adult	FEMALE	Dream South	Dream	NaT	Healthy	PAPRI2020	yes	2020-03-17	False
CHN-0860	Chinstrap	50.88	18.49	206.10	7000.00	Adult	NaN	Cormorant East	Cormorant	2024-07-09	Overweight	PAPRI2023	yes	2023-11-16	False
GEN-0974	Gentoo	50.57	15.89	220.00	7000.00	Adult	NaN	Torgersen North	NaN	2021-01-05	NaN	PAPRI2021	yes	2020-12-26	False
GEN-0681	Gentoo	47.77	13.84	222.73	7378.33	Adult	NaN	Torgersen North	Torgersen	NaT	Overweight	PAPRI2022	yes	2022-11-07	True
NaN	Chinstrap	51.63	18.69	212.94	7128.38	Adult	FEMALE	Torgersen North	Torgersen	2022-03-25	Overweight	PAPRI2020	NaN	2020-03-12	False
NaN	Gentoo	47.71	13.93	236.20	7085.98	Adult	NaN	Torgersen North	Torgersen	NaT	Critical	NaN	no	NaT	True
CHN-0219	Chinstrap	62.64	18.00	204.26	2770.38	Juvenile	NaN	Torgersen North	UNKNOWN	2020-10-22	Critical	PAPRI2019	yes	2019-10-16	False
NaN	Gentoo	NaN	14.99	219.59	7128.48	Adult	NaN	Torgersen North	Torgersen	2021-10-31	Healthy	PAPRI2019	NaN	NaT	False

strategy	column	outliers_handled	details
clip	bill_length_mm	1	Clipped 1 values to bounds.
median	body_mass_g	18	Imputed 18 values with median (3742.00).

🧪 Analyst Toolkit Tutorial: Full Data Pipeline¶

🧰 Toolkit Architecture: 3-Way Modular Design¶

🧪 Step 1: Run Initial Diagnostics (M01)¶

🔷 Shape

🧠 Memory Usage

♻️ Duplicate Summary

🔢 High Cardinality

📚 Full Data Profile

🔢 Descriptive Statistics

📋 First 5 Rows (.head)

🛡️ Step 2: Run Schema & Content Validation (M02)¶

Failure Details

🧹 Step 3: Normalize & Standardize Data (M03)¶

✏️ Columns Renamed (2)

🧹 Strings Cleaned (2)

📅 Datetimes Parsed (2)

🧩 Values Mapped (7)

🤖 Fuzzy Matches (7)

Column: sex

Column: island

Column: species

Column: health_status

Column: colony_id

Column: age_group

Column: study_name

Column: capture_date

Column: date_egg

Column: clutch_completion

🛡️ Step 4: Certification Gate (M02)¶

🧹 Step 5: Deduplication (M04)¶

📏 Step 6: Detect Outliers (M05)¶

🧼 Step 7: Handle Outliers (M06)¶

🔧 Step 8: Impute Missing Values (M07)¶

📋 Imputation Actions Log

🔍 Null Value Audit

Column: sex

Column: tag_id

Column: species

Column: age_group

Column: colony_id

Column: island

Column: study_name

Column: clutch_completion

Column: health_status

🧩 Behind the Scenes: Utility & Visual Modules¶

🧰 m00_utils/¶

📊 m08_visuals/¶

🎬 Step 9: Final Auditing and Certification (M10)¶

🚦 Failures Schema Conformity

📊 Pipeline Status

🛠️ Final Edits Log

🧬 Data Lifecycle

📚 Data Dictionary / Schema

🧭 What’s Next?¶

Column: `sex`

Column: `island`

Column: `species`

Column: `health_status`

Column: `colony_id`

Column: `age_group`

Column: `study_name`

Column: `capture_date`

Column: `date_egg`

Column: `clutch_completion`

Column: `sex`

Column: `tag_id`

Column: `species`

Column: `age_group`

Column: `colony_id`

Column: `island`

Column: `study_name`

Column: `clutch_completion`

Column: `health_status`

🧰 `m00_utils/`¶

📊 `m08_visuals/`¶

Column	Strategy	Fill Value	Nulls Filled
bill_length_mm	mean	45.17	429
body_mass_g	mean	3842.08	406
bill_depth_mm	median	17.48	417
flipper_length_mm	median	199.31	451
sex	mode	MALE	2739
tag_id	constant	UNKNOWN	2242
species	constant	UNKNOWN	166
age_group	constant	UNKNOWN	121
colony_id	constant	UNKNOWN	405
island	constant	UNKNOWN	584
study_name	constant	UNKNOWN	563
capture_date	constant	1900-01-01 00:00:00	915
date_egg	constant	1900-01-01 00:00:00	836
clutch_completion	constant	UNKNOWN	463
health_status	constant	UNKNOWN	554

Value	Count
UNKNOWN	2242
GEN-0271	5
ADE-0119	4
GEN-0143	4
ADE-0176	4
GEN-0751	4
GEN-0673	4
GEN-0433	4
GEN-0902	4
GEN-0106	4

Value	Original Count	Imputed Count	Change
NaN	2242	0	-2242
GEN-0271	5	5	0
ADE-0119	4	4	0
ADE-0176	4	4	0
ADE-0203	4	4	0
CHN-0905	4	4	0
GEN-0054	4	4	0
GEN-0106	4	4	0
GEN-0143	4	4	0
GEN-0433	4	4	0

Metric	Value
Final Pipeline Status	❌ CERTIFICATION FAILED
Certification Rules Passed	False
Null Value Audit Passed	True

Metric	Value
Initial Rows	5541
Final Rows	5541
Initial Columns	15
Final Columns	16

Column	Dtype	Unique Values	Audit Remarks
tag_id	object	2679	✅ OK
species	object	4	✅ OK
bill_length_mm	float64	1985	✅ OK
bill_depth_mm	float64	863	✅ OK
flipper_length_mm	float64	1467	✅ OK
body_mass_g	float64	3324	✅ OK
age_group	object	4	✅ OK
sex	object	3	✅ OK
colony_id	object	6	✅ OK
island	object	6	✅ OK
capture_date	datetime64[ns]	1746	✅ OK
health_status	object	6	✅ OK
study_name	object	7	✅ OK
clutch_completion	object	3	✅ OK
date_egg	datetime64[ns]	1657	✅ OK
is_duplicate	bool	2	✅ OK

Metric	count	mean	std	min	25%	50%	75%	max	skew	kurtosis
bill_length_mm	5541.0	45.166681	5.442593	30.63	40.98	45.240	49.07	62.635000	-0.151954	-0.405922
bill_depth_mm	5541.0	17.318895	2.146392	12.37	15.65	17.485	18.92	23.010000	-0.134672	-0.725335
flipper_length_mm	5541.0	201.999903	13.769645	162.79	191.80	199.315	213.00	252.400000	0.392703	-0.397019
body_mass_g	5541.0	3842.084375	845.336672	2376.56	3264.00	3806.000	4266.00	6965.072934	0.552218	0.015302