import os
# Point Qt at the Anaconda platform-plugin directory (presumably so a Qt-based
# matplotlib backend can start on the original Windows machine -- confirm).
# The path is machine-specific, so it is exported only when the directory
# actually exists; otherwise Qt would be told to look in a folder that is not
# there and its own default plugin lookup is the better choice.
_QT_PLUGIN_DIR = 'C:/ProgramData/Anaconda3/Library/plugins/platforms'
if os.path.isdir(_QT_PLUGIN_DIR):
    os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = _QT_PLUGIN_DIR

#Start of Jupyter

import pandas as pd, numpy as np, matplotlib.pyplot as plt
from textwrap import fill

# Location of the input data and the keyword arguments shared by every
# savefig call in this script.
CSV_PATH = "starbucks.csv"  # change if needed
IMG_PAD = dict(dpi=220, bbox_inches="tight")

# Load the data, then trim leading/trailing whitespace from the header names
# so later column lookups can use the clean spellings.
df = pd.read_csv(CSV_PATH)
df.columns = df.columns.str.strip()
#print("Shape:", df.shape)
#print("Columns:", list(df.columns))

#line 2

def resolve_col(cands, columns=None):
    """Return the first name in *cands* that is an available column.

    Args:
        cands: Candidate column names, in order of preference.
        columns: Collection of available column names to search.  When
            omitted, the columns of the module-level ``df`` are used.

    Returns:
        The first candidate that is present in the available columns.

    Raises:
        KeyError: If none of the candidates is present.
    """
    available = df.columns if columns is None else columns
    # Materialise once so a one-shot iterable can still be reported on failure.
    candidates = list(cands)
    for name in candidates:
        if name in available:
            return name
    raise KeyError(
        f"none of the candidate columns {candidates} found in {list(available)}"
    )

# Map each logical field to whichever header spelling this CSV actually uses.
col_cal = resolve_col(["Calories", "calories"])
col_sugar = resolve_col(["Sugars (g)", "Sugar (g)", "Sugars"])
col_fat = resolve_col(["Total Fat (g)", "Fat (g)", "Total Fat"])
col_carb = resolve_col(
    ["Total Carbohydrates (g)", "Carbohydrates (g)", "Total Carbohydrates"]
)
col_prot = resolve_col(["Protein (g)", "Protein"])
col_cat = resolve_col(["Beverage_category", "Beverage Category"])
col_prep = resolve_col(["Beverage_prep", "Beverage Preparation", "Beverage Prep"])
col_name = resolve_col(["Beverage", "Beverage Name", "Drink"])

# Convert the nutrient columns to numbers; entries that cannot be parsed
# become NaN (errors="coerce").
numeric_cols = [col_cal, col_sugar, col_fat, col_carb, col_prot]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

# The rest of the script works on this copy of the loaded frame.
d = df.copy()
#print("Resolved:", col_cal, col_sugar, col_fat, col_carb, col_prot, col_cat, col_prep, col_name)

Starbucks beverages vary widely in calories and nutritional content depending on preparation, size, and ingredients. This analysis uses the Starbucks Nutrition dataset to visualize and explain key trends in calories, sugars, and macronutrients across beverage categories.

The report explores five distinct visualizations to understand how drink type, size, and ingredients contribute to total calorie content. These insights can help consumers make more informed choices and highlight how customization affects nutritional balance.


#line 3 

# One-sentence framing of the report; only used by the optional print below.
intro = " ".join([
    "A Nutritional Breakdown of Starbucks Beverages: descriptive analysis of calories and key macronutrients",
    "across categories and sizes using five distinct visualizations.",
])
#print(fill(intro, 100))

# describe() summary of the five nutrient columns, rounded to two decimals.
nutrient_cols = [col_cal, col_sugar, col_fat, col_carb, col_prot]
desc = d[nutrient_cols].describe().round(2)
#print("\nDescriptives (Calories, Sugars, Fat, Carbs, Protein):")
#print(desc)
#line 4 

# Mean calories per beverage category, sorted ascending.  dropna=False keeps
# rows whose category is missing as a group of their own.
avg_cal = d.groupby(col_cat, dropna=False)[col_cal].mean().sort_values()

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(avg_cal.index.astype(str), avg_cal.values)
ax.set(
    title="Average Calories by Beverage Category",
    xlabel="Beverage Category",
    ylabel="Average Calories",
)
# Tilt the category names and anchor each one on its right edge.
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment="right")
fig.tight_layout()
fig.savefig("viz1_avg_calories_by_category.png", **IMG_PAD)
plt.show()


# Caption text for figure 1; only used by the optional print below.
interp1 = (
    "Interpretation: Blended/specialty categories show the highest average calories; "
    "brewed coffee and tea are lowest. This indicates syrups, bases, and dairy additions "
    "drive category-level calories."
)
#print(fill(interp1, 100))

Interpretation: Blended and specialty beverages have the highest average calories due to syrups and dairy bases, while brewed coffee and tea remain the lowest. This pattern highlights the calorie impact of added ingredients.

#line 5 

# Derive the cup size from the first whitespace-separated word of the
# preparation text and store it as a new "Size" column on `d`.
# NOTE(review): any preparation whose first word is not one of the four names
# in `order` is left out of this plot -- confirm that dropping those rows is
# intended.
d["Size"] = d[col_prep].astype(str).str.split().str[0]
order = ["Short","Tall","Grande","Venti"]
# One calorie series per size, in display order, with missing values removed.
data_by_size = [d.loc[d["Size"]==s, col_cal].dropna() for s in order]

fig, ax = plt.subplots(figsize=(8,5))
# Label the boxes after drawing instead of passing `labels=`: Matplotlib 3.9
# renamed that boxplot keyword to `tick_labels` and deprecated the old name,
# so setting the tick labels directly works on either side of the rename.
ax.boxplot(data_by_size, showfliers=True)
ax.set_xticklabels(order)
ax.set_title("Distribution of Calories by Drink Size")
ax.set_xlabel("Drink Size"); ax.set_ylabel("Calories")
plt.tight_layout(); plt.savefig("viz2_calories_by_size_boxplot.png", **IMG_PAD); plt.show()


# Caption text for figure 2; only used by the optional print below.
interp2 = (
    "Interpretation: Calories rise with size on average, but wide spreads show that ingredients and preparation "
    "(syrups, sauces, milk type, toppings) can outweigh size alone. The most noticable jump between sizes happens from tall to grande. "
    "The average calories between any given drink doesn't change much between grande and venti. "
)
#print(fill(interp2, 100))

Interpretation: Calories rise with size on average, but wide spreads show that ingredients and preparation (syrups, sauces, milk type, toppings) can outweigh size alone. The most noticeable jump between sizes happens from Tall to Grande. The average calories of any given drink do not change much between Grande and Venti.

#line 6 

# Keep only the rows where both calories and sugars are present.
S = d[[col_cal, col_sugar]].dropna()
x = S[col_sugar].values
y = S[col_cal].values

# Degree-1 least-squares fit (slope m, intercept b) and the correlation
# coefficient between the two columns.
m, b = np.polyfit(x, y, 1)
xs = np.sort(x)
corr = np.corrcoef(x, y)[0, 1]

fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, alpha=0.6, s=20)
ax.plot(xs, m * xs + b, linewidth=2)
ax.set(title="Calories vs. Sugars", xlabel="Sugars (g)", ylabel="Calories")
# Annotate r near the top-left corner, positioned in axes coordinates.
ax.text(0.02, 0.95, f"r = {corr:.2f}", transform=ax.transAxes, va="top")
fig.tight_layout()
fig.savefig("viz3_calories_vs_sugars_scatter.png", **IMG_PAD)
plt.show()


# Caption text for figure 3; only used by the optional print below.
interp3 = (
    f"Interpretation: Calories and sugars have a strong positive relationship (r ≈ {corr:.2f}), "
    "supporting sugar as a major calorie source. "
    "Deviations indicate other contributors—especially fat from dairy or toppings."
)
#print(fill(interp3, 100))

Interpretation: A strong positive correlation between sugar and calories confirms that sugar content is a major calorie source. Drinks deviating from the trend line likely contain higher fat from milk or whipped cream.

#line 7 

# Within each category, find the beverage whose mean calories are highest.
rep = d.groupby([col_cat, col_name])[col_cal].mean().reset_index()
idx = rep.groupby(col_cat)[col_cal].idxmax()
top_bevs = rep.loc[idx, col_name]

# Mean fat / carbohydrate / protein per beverage name, dropping beverages with
# a missing mean, then restricted to the picks above and capped at six rows
# for readability.
macros = d.groupby(col_name)[[col_fat, col_carb, col_prot]].mean().dropna()
picked = macros.index.intersection(top_bevs)
macros = macros.loc[picked].head(6)
labels = macros.index.astype(str)

fig, ax = plt.subplots(figsize=(10, 5))
# Running top of each stack; every macro is drawn on top of the previous ones.
bottom = np.zeros(len(macros))
for macro_col in (col_fat, col_carb, col_prot):
    vals = macros[macro_col].values
    ax.bar(labels, vals, bottom=bottom, label=macro_col)
    bottom = bottom + vals

ax.set(
    title="Macronutrient Composition (g) of Representative High-Calorie Drinks",
    xlabel="Beverage",
    ylabel="Grams per Drink (avg across preps)",
)
ax.legend(title="Macronutrient")
# Tilt the beverage names and anchor each one on its right edge.
plt.setp(ax.get_xticklabels(), rotation=35, horizontalalignment="right")
fig.tight_layout()
fig.savefig("viz4_macros_stacked_bar.png", **IMG_PAD)
plt.show()


# Caption text for figure 4; only used by the optional print below.
interp4 = (
    "Interpretation: Carbohydrates (incl. sugars) dominate the macro mix; "
    "fat contributes substantially in dairy-heavy items; protein is minimal. "
    "High-calorie drinks are largely carbs + fat driven."
)
#print(fill(interp4, 100))

Interpretation: Carbohydrates, including sugars, dominate the nutrient composition of Starbucks drinks, followed by fats in dairy-heavy items. Protein remains minimal across most categories.

#line 8 

# Mean calories per size, laid out in menu order.  Uses the "Size" column that
# the box-plot section adds to `d`; reindex fixes the order of the four sizes.
size_order = ["Short", "Tall", "Grande", "Venti"]
in_known_size = d["Size"].isin(size_order)
size_avg = d.loc[in_known_size].groupby("Size")[col_cal].mean().reindex(size_order)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(size_avg.index, size_avg.values, marker="o")
ax.set(
    title="Average Calories by Drink Size",
    xlabel="Drink Size",
    ylabel="Average Calories",
)
fig.tight_layout()
fig.savefig("viz5_calories_by_size_line.png", **IMG_PAD)
plt.show()


# Caption text for figure 5; only used by the optional print below.
interp5 = (
    "Interpretation: Average calories increase consistently from Short to Venti, "
    "indicating portion size is a strong determinant of calorie totals. "
    "Earlier figures show ingredients can still make smaller sizes overlap with larger ones."
)
#print(fill(interp5, 100))

Interpretation: The smooth upward trend from Short to Venti sizes reinforces that portion size consistently drives calorie increases, though individual recipes can still cause overlap across sizes.

This analysis reveals that Starbucks beverage calories are largely driven by sugar content and portion size, with additional variation from fat-heavy preparations.

From a consumer standpoint, opting for smaller sizes, milk alternatives, and fewer syrups can significantly reduce total calories and sugar intake.

From a business perspective, the data emphasizes the importance of transparent nutritional labeling and customization tools that help customers understand how each ingredient impacts the final product.