# Load necessary libraries
library(ggplot2)
# Load data and keep the first 100 rows
df <- read.csv("food_coded.csv")
# head() never pads the subset with all-NA rows when the file is shorter
# than 100 rows, which df[1:100, ] would do
df_subset <- head(df, 100)
# Clean up GPA and calories_day
# GPA contains non-numeric entries, so it has to be coerced. Going through
# as.character() first keeps the coercion correct even if read.csv() returned
# a factor (as.numeric() on a factor yields level codes, not the values).
df_subset$GPA <- as.numeric(as.character(df_subset$GPA))
## Warning: NAs introduced by coercion
# Keep only the two analysis columns and drop every row where either one is
# missing, including the GPA entries that could not be parsed above
df_clean <- na.omit(df_subset[, c("GPA", "calories_day")])
# Scatter plot of calories_day against GPA with a fitted straight line
ggplot(data = df_clean, mapping = aes(GPA, calories_day)) +
  geom_point() +
  # Linear-model trend line drawn in blue, without the confidence band
  geom_smooth(se = FALSE, colour = "blue", method = "lm") +
  ggtitle("GPA vs Daily Calorie Intake") +
  xlab("GPA") +
  ylab("Calories per Day")
## `geom_smooth()` using formula = 'y ~ x'

# Fit a simple linear regression of calories_day on GPA,
# then display the coefficient table and fit statistics
model <- lm(data = df_clean, formula = calories_day ~ GPA)
summary(object = model)
## 
## Call:
## lm(formula = calories_day ~ GPA, data = df_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02948 -0.02579 -0.02301 -0.02160  0.97839 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.039656   0.594886   5.110 2.07e-06 ***
## GPA         -0.004624   0.172431  -0.027    0.979    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6242 on 82 degrees of freedom
## Multiple R-squared:  8.768e-06,  Adjusted R-squared:  -0.01219 
## F-statistic: 0.000719 on 1 and 82 DF,  p-value: 0.9787