# # 📚 Portfolio Project: Global Causes of Death — Exploratory Analysis & Interactive Visualizations
# **Author:** Adika Maulana 
# **Goal:** produce a clean, reproducible EDA + interactive visualizations suitable for inclusion in a portfolio.
#
# **Dataset:** `deathcause.csv` (expected columns include `Country/Territory`, `Code` (ISO3), `Year`, and many cause columns where each column is deaths for that cause).
#
# Notes:
# - Paste and run cells sequentially in Jupyter / Colab.
# - If using Colab, enable `ipywidgets` with `!jupyter nbextension enable --py widgetsnbextension` if necessary.
# - The author name above is already filled in; replace it with your own name if you reuse this notebook for your portfolio.

# %% [markdown]
# ## 1) Setup & Imports
# We'll use pandas for data handling, Matplotlib/Seaborn for the charts, and scikit-learn for trend fitting and clustering. Plotly and ipywidgets are imported for optional interactive charts and controls.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from IPython.display import display, Markdown
import ipywidgets as widgets
import warnings
warnings.filterwarnings("ignore")

# Read the raw mortality table. The path is relative, so it is resolved
# against the current working directory.
csv_path = "deathcause.csv"
df = pd.read_csv(csv_path)

# Sanity check: report the (rows, columns) shape and preview the first records.
print("Data loaded successfully. Shape:", df.shape)
preview = df.head()
display(preview)

Data loaded successfully. Shape: (6120, 34)

# -- 3. BASIC INFO
# Print a heading, then let pandas summarise the frame (DataFrame.info writes
# its report straight to stdout).
info_heading = "\n--- Dataset Info ---"
print(info_heading)
df.info()

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6120 entries, 0 to 6119
Data columns (total 34 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   Country/Territory                           6120 non-null   object
 1   Code                                        6120 non-null   object
 2   Year                                        6120 non-null   int64 
 3   Meningitis                                  6120 non-null   int64 
 4   Alzheimer's Disease and Other Dementias     6120 non-null   int64 
 5   Parkinson's Disease                         6120 non-null   int64 
 6   Nutritional Deficiencies                    6120 non-null   int64 
 7   Malaria                                     6120 non-null   int64 
 8   Drowning                                    6120 non-null   int64 
 9   Interpersonal Violence                      6120 non-null   int64 
 10  Maternal Disorders                          6120 non-null   int64 
 11  HIV/AIDS                                    6120 non-null   int64 
 12  Drug Use Disorders                          6120 non-null   int64 
 13  Tuberculosis                                6120 non-null   int64 
 14  Cardiovascular Diseases                     6120 non-null   int64 
 15  Lower Respiratory Infections                6120 non-null   int64 
 16  Neonatal Disorders                          6120 non-null   int64 
 17  Alcohol Use Disorders                       6120 non-null   int64 
 18  Self-harm                                   6120 non-null   int64 
 19  Exposure to Forces of Nature                6120 non-null   int64 
 20  Diarrheal Diseases                          6120 non-null   int64 
 21  Environmental Heat and Cold Exposure        6120 non-null   int64 
 22  Neoplasms                                   6120 non-null   int64 
 23  Conflict and Terrorism                      6120 non-null   int64 
 24  Diabetes Mellitus                           6120 non-null   int64 
 25  Chronic Kidney Disease                      6120 non-null   int64 
 26  Poisonings                                  6120 non-null   int64 
 27  Protein-Energy Malnutrition                 6120 non-null   int64 
 28  Road Injuries                               6120 non-null   int64 
 29  Chronic Respiratory Diseases                6120 non-null   int64 
 30  Cirrhosis and Other Chronic Liver Diseases  6120 non-null   int64 
 31  Digestive Diseases                          6120 non-null   int64 
 32  Fire, Heat, and Hot Substances              6120 non-null   int64 
 33  Acute Hepatitis                             6120 non-null   int64 
dtypes: int64(32), object(2)
memory usage: 1.6+ MB

# Count NaN cells per column and keep the ten columns with the most gaps.
missing_per_column = df.isna().sum()
worst_columns = missing_per_column.sort_values(ascending=False).head(10)

print("\n--- Missing values ---")
print(worst_columns)

--- Missing values ---
Country/Territory                          0
Code                                       0
Year                                       0
Meningitis                                 0
Alzheimer's Disease and Other Dementias    0
Parkinson's Disease                        0
Nutritional Deficiencies                   0
Malaria                                    0
Drowning                                   0
Interpersonal Violence                     0
dtype: int64

# Identify columns
# The three identifier columns describe the observation; every remaining
# column is treated as a cause column.
id_cols = ["Country/Territory", "Code", "Year"]
id_col_lookup = set(id_cols)
cause_cols = [name for name in df.columns if name not in id_col_lookup]
print(f"\nDetected {len(cause_cols)} cause columns.")

Detected 31 cause columns.

# -- 4. TRANSFORM TO LONG FORMAT
# Reshape from one column per cause to one row per
# (Country/Territory, Code, Year, Cause) with the value in "Deaths".
df_long = df.melt(id_vars=id_cols, value_vars=cause_cols,
                  var_name="Cause", value_name="Deaths")

# Convert to numeric: entries that cannot be parsed become NaN and are then
# replaced by 0.
df_long["Deaths"] = pd.to_numeric(df_long["Deaths"], errors="coerce").fillna(0)

print("\nLong-format preview:")
# Call display() explicitly, like the other preview cells do: a bare
# `df_long.head()` only renders when it is the last expression of a notebook
# cell and is a silent no-op otherwise.
display(df_long.head())

Long-format preview:

# -- 5. GLOBAL TOTALS BY CAUSE
# Sum deaths over every row of the long table, largest cause first.
deaths_by_cause = df_long.groupby("Cause")["Deaths"].sum()
cause_totals = deaths_by_cause.sort_values(ascending=False)
top10 = cause_totals.iloc[:10]

# Horizontal bar chart of the ten largest totals.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top10.values, y=top10.index, palette="Reds_r", ax=ax)
ax.set_title("Top 10 Global Causes of Death", fontsize=14)
ax.set_xlabel("Total Deaths")
ax.set_ylabel("Cause")
fig.tight_layout()
plt.show()

# Same ranking as a table.
print("\n--- Top 10 causes of death ---")
display(top10)

--- Top 10 causes of death ---

Cause
Cardiovascular Diseases                       447741982
Neoplasms                                     229758538
Chronic Respiratory Diseases                  104605334
Lower Respiratory Infections                   83770038
Neonatal Disorders                             76860729
Diarrheal Diseases                             66235508
Digestive Diseases                             65638635
Tuberculosis                                   45850603
Cirrhosis and Other Chronic Liver Diseases     37479321
HIV/AIDS                                       36364419
Name: Deaths, dtype: int64

# -- 6. YEARLY TREND FOR TOP 6 CAUSES
# Keep only the six largest causes, then total their deaths per year.
top6_causes = top10.index[:6]
is_top6 = df_long["Cause"].isin(top6_causes)
trend = df_long[is_top6]
trend_yearly = (
    trend.groupby(["Year", "Cause"])["Deaths"]
    .sum()
    .reset_index()
)

# One line per cause across the years.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=trend_yearly, x="Year", y="Deaths", hue="Cause",
             marker="o", ax=ax)
ax.set_title("Yearly Death Trends for Top 6 Causes", fontsize=14)
fig.tight_layout()
plt.show()

# -- 7. TOP COUNTRIES FOR LEADING CAUSE
# The leading cause is the first entry of the sorted top-10 ranking.
top_cause = top10.index[0]
leading_rows = df_long[df_long["Cause"] == top_cause]
country_top_cause = (
    leading_rows.groupby("Country/Territory")["Deaths"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# Horizontal bar chart of the ten largest per-country totals for that cause.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=country_top_cause.values, y=country_top_cause.index,
            palette="Blues_r", ax=ax)
ax.set_title(f"Top 10 Countries - {top_cause}", fontsize=14)
ax.set_xlabel("Total Deaths")
fig.tight_layout()
plt.show()

print(f"\nTop 10 countries for {top_cause}:")
display(country_top_cause)

Top 10 countries for Cardiovascular Diseases:

Country/Territory
China            100505973
India             52994710
Russia            33903781
United States     26438346
Indonesia         13587011
Ukraine           13053052
Germany           10819770
Brazil             9589019
Japan              9210437
Pakistan           7745192
Name: Deaths, dtype: int64

# Melt / reshape dataframe (long format)
# No value_vars are given, so every column other than the three identifiers is
# melted; this rebuilds the same long table as the earlier melt over
# `cause_cols`.
df_long = pd.melt(
    df,
    id_vars=["Country/Territory", "Code", "Year"],
    var_name="Cause",
    value_name="Deaths",
)

# Convert Deaths to numeric: unparseable entries become NaN, then 0.
deaths_numeric = pd.to_numeric(df_long["Deaths"], errors="coerce")
df_long["Deaths"] = deaths_numeric.fillna(0)

print("Data reshaped successfully. Shape:", df_long.shape)
display(df_long.head())

Data reshaped successfully. Shape: (189720, 5)

# -- 8. Compute linear slope (yearly trend) for each cause
# Aggregate once into a Year x Cause table. Filtering and re-grouping the whole
# long frame inside the loop repeated the same full scan for every cause.
yearly_by_cause = (
    df_long.groupby(["Year", "Cause"])["Deaths"].sum().unstack("Cause")
)

trend_stats = []
for cause in cause_cols:
    # dropna() keeps only the years that actually have rows for this cause,
    # matching what a per-cause groupby on the long frame would return.
    yearly = yearly_by_cause[cause].dropna()
    # Skip causes whose yearly totals do not sum to a positive number.
    if yearly.sum() > 0:
        X = yearly.index.to_numpy().reshape(-1, 1)  # years as a single feature column
        y = yearly.to_numpy()
        model = LinearRegression().fit(X, y)
        # Coefficient of the fitted line: change in summed deaths per year.
        slope = model.coef_[0]
        trend_stats.append((cause, slope))

# Sorted by slope, largest first: head() holds the largest slopes and tail()
# the smallest.
trend_df = pd.DataFrame(trend_stats, columns=["Cause","Trend_Slope"]).sort_values("Trend_Slope", ascending=False)
print("\n--- Top 5 increasing trends ---")
display(trend_df.head(5))

print("\n--- Top 5 decreasing trends ---")
display(trend_df.tail(5))

--- Top 5 increasing trends ---

--- Top 5 decreasing trends ---

# Visualization
# Bar chart of the ten causes with the largest fitted slope (trend_df is
# already sorted in descending slope order).
strongest_rising = trend_df.head(10)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=strongest_rising, x="Trend_Slope", y="Cause",
            palette="Greens_r", ax=ax)
ax.set_title("Causes with Strongest Increasing Trends", fontsize=14)
ax.set_xlabel("Slope (Deaths per Year)")
fig.tight_layout()
plt.show()

# -- 9. Cluster causes by total deaths and trend slope
# Name the totals column before merging instead of overwriting
# merge_df.columns by position, which would silently mislabel columns if the
# merge output ever gained or reordered a column.
merge_df = pd.merge(
    cause_totals.rename("Total_Deaths").reset_index(),
    trend_df,
    on="Cause",
)

# Standardise both features (subtract the mean, divide by the standard
# deviation) so neither dominates the distance computation through scale alone.
X = merge_df[["Total_Deaths","Trend_Slope"]]
X_scaled = (X - X.mean()) / X.std()

# n_init is pinned explicitly: the library default differs between
# scikit-learn releases, so leaving it implicit makes the cluster assignment
# depend on the installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X_scaled)
merge_df["Cluster"] = kmeans.labels_

# Scatter in the original (unscaled) units, coloured by cluster label.
plt.figure(figsize=(8,6))
sns.scatterplot(data=merge_df, x="Total_Deaths", y="Trend_Slope",
                hue="Cluster", palette="Set2", s=100)
plt.title("KMeans Clusters of Death Causes")
plt.xlabel("Total Deaths")
plt.ylabel("Trend Slope")
plt.tight_layout()
plt.show()

from IPython.display import Markdown

# Closing summary rendered as Markdown. The text is a fixed string; nothing in
# it is filled in from the data frames computed earlier.
insights = """
### 📊 Key Insights
1. **Top global causes** such as Cardiovascular Diseases dominate worldwide mortality, indicating chronic conditions as key contributors.
2. **Yearly trend analysis** shows some causes (e.g., cardiovascular diseases) are declining in developed regions but rising in others.
3. **Country breakdown** reveals mortality concentration — e.g., top countries contributing most to global totals.
4. **Clustering** highlights three groups of causes:
   - High-death & increasing trend (urgent policy focus)
   - High-death but declining trend (successful intervention areas)
   - Low-death, stable causes (less global impact)
5. The dataset’s wide format enables strong comparative analysis between countries and causes.
"""
insights_md = Markdown(insights)
display(insights_md)

	Country/Territory	Code	Year	Meningitis	Alzheimer's Disease and Other Dementias	Parkinson's Disease	Nutritional Deficiencies	Malaria	Drowning	Interpersonal Violence	...	Diabetes Mellitus	Chronic Kidney Disease	Poisonings	Protein-Energy Malnutrition	Road Injuries	Chronic Respiratory Diseases	Cirrhosis and Other Chronic Liver Diseases	Digestive Diseases	Fire, Heat, and Hot Substances	Acute Hepatitis
0	Afghanistan	AFG	1990	2159	1116	371	2087	93	1370	1538	...	2108	3709	338	2054	4154	5945	2673	5005	323	2985
1	Afghanistan	AFG	1991	2218	1136	374	2153	189	1391	2001	...	2120	3724	351	2119	4472	6050	2728	5120	332	3092
2	Afghanistan	AFG	1992	2475	1162	378	2441	239	1514	2299	...	2153	3776	386	2404	5106	6223	2830	5335	360	3325
3	Afghanistan	AFG	1993	2812	1187	384	2837	108	1687	2589	...	2195	3862	425	2797	5681	6445	2943	5568	396	3601
4	Afghanistan	AFG	1994	3027	1211	391	3081	211	1809	2849	...	2231	3932	451	3038	6001	6664	3027	5739	420	3816

	Country/Territory	Code	Year	Cause	Deaths
0	Afghanistan	AFG	1990	Meningitis	2159
1	Afghanistan	AFG	1991	Meningitis	2218
2	Afghanistan	AFG	1992	Meningitis	2475
3	Afghanistan	AFG	1993	Meningitis	2812
4	Afghanistan	AFG	1994	Meningitis	3027

	Country/Territory	Code	Year	Cause	Deaths
0	Afghanistan	AFG	1990	Meningitis	2159
1	Afghanistan	AFG	1991	Meningitis	2218
2	Afghanistan	AFG	1992	Meningitis	2475
3	Afghanistan	AFG	1993	Meningitis	2812
4	Afghanistan	AFG	1994	Meningitis	3027

	Cause	Trend_Slope
11	Cardiovascular Diseases	204860.999778
19	Neoplasms	139936.132147
1	Alzheimer's Disease and Other Dementias	35945.690545
21	Diabetes Mellitus	29367.171969
22	Chronic Kidney Disease	29100.355506

	Cause	Trend_Slope
3	Nutritional Deficiencies	-19336.996663
10	Tuberculosis	-23578.541713
12	Lower Respiratory Infections	-31164.048053
13	Neonatal Disorders	-35603.060067
17	Diarrheal Diseases	-49195.923026

📊 Key Insights