# Read the micronutrient and macronutrient tables (read.csv already splits on
# commas by default), join them on their shared columns, and list the class of
# every column in the merged data.
micro <- read.csv("USDA_Micronutrients.csv")
macro <- read.csv("USDA_Macronutrients.csv")
USDA <- merge(x = macro, y = micro)
sapply(USDA, class)
## ID Description Calories Protein TotalFat Carbohydrate
## "integer" "character" "integer" "numeric" "numeric" "numeric"
## Sodium Cholesterol Sugar Calcium Iron Potassium
## "character" "integer" "numeric" "integer" "numeric" "character"
## VitaminC VitaminE VitaminD
## "numeric" "numeric" "numeric"
# Sodium and Potassium were read as character: strip the commas from their
# values and convert both columns to numeric.
for (column in c("Sodium", "Potassium")) {
  USDA[[column]] <- as.numeric(gsub(",", "", USDA[[column]]))
}
# Count the missing values in each record and keep only the records that have
# fewer than max_missing NAs. rowSums() is the built-in equivalent of
# apply(..., 1, sum) on the logical NA matrix.
max_missing <- 7
na_count <- rowSums(is.na(USDA))
USDA <- USDA[na_count < max_missing, ]
cat("Number of remaining records:", nrow(USDA))
## Number of remaining records: 6965
# Replace the missing Sugar, VitaminE and VitaminD values with the mean of the
# observed (non-missing) values of the same column.
for (column in c("Sugar", "VitaminE", "VitaminD")) {
  missing <- is.na(USDA[[column]])
  USDA[[column]][missing] <- mean(USDA[[column]], na.rm = TRUE)
}
# Keep only the records that have no missing value in any column.
has_all_values <- complete.cases(USDA)
USDAclean <- USDA[has_all_values, ]
cat("Number of remaining records:", nrow(USDAclean))
## Number of remaining records: 6310
# Description of the food (or foods, on ties) with the highest Sodium value.
with(USDAclean, as.character(Description[Sodium == max(Sodium)]))
## [1] "SALT,TABLE"
# Histogram of Vitamin C with the y axis limited to the range 1-100.
hist(
  USDAclean$VitaminC,
  ylim = c(1, 100),
  xlab = "Vitamin C",
  main = "Vitamin C Distribution"
)
#### 9. Create one boxplot to illustrate the distribution of values for
#### TotalFat, Protein and Carbohydrate. (4 points)
# One boxplot per macronutrient. Passing the columns as a data frame labels
# each box with its column name instead of the default 1, 2, 3.
boxplot(USDAclean[c("TotalFat", "Protein", "Carbohydrate")])
# Scatterplot of Calories (y) against TotalFat (x).
plot(Calories ~ TotalFat, data = USDAclean)
# Add one 0/1 indicator column per nutrient: 1 when the record's value is above
# the mean of that nutrient, 0 otherwise. Names are the new flag columns,
# values are the nutrient columns they are derived from.
flag_sources <- c(
  HighSodium = "Sodium",
  HighCalories = "Calories",
  HighProtein = "Protein",
  HighSugar = "Sugar",
  HighFat = "TotalFat"
)
for (flag in names(flag_sources)) {
  nutrient <- USDAclean[[flag_sources[[flag]]]]
  USDAclean[[flag]] <- 0
  USDAclean[[flag]][which(nutrient > mean(nutrient))] <- 1
}
# Count the records flagged as both high sodium and high fat.
both_high <- USDAclean$HighSodium == 1 & USDAclean$HighFat == 1
cat(sum(both_high), "foods have both high sodium and high fat.")
## 644 foods have both high sodium and high fat.
# Average Iron for the low-protein (0) and high-protein (1) groups.
MeanProteinIron <- aggregate(
  x = USDAclean$Iron,
  by = list(USDAclean$HighProtein),
  FUN = mean
)
names(MeanProteinIron) <- c("low/high protein", "AVG")
head(MeanProteinIron)
## low/high protein AVG
## 1 0 2.696634
## 2 1 3.069541
# Attach jpeg with library() so that a missing package stops the script here
# with a clear error, rather than require() returning FALSE and the failure
# surfacing later at readJPEG().
library(jpeg)
img <- readJPEG("HealthCheck.jpg")
# Set up an empty canvas without annotation or axes, then draw the image over
# the whole 1-4 coordinate range. Argument names are spelled out in full
# (type, FALSE) instead of relying on partial matching and the F shortcut.
plot(1:4, type = "n", ann = FALSE, xaxt = "n", yaxt = "n")
rasterImage(img, 1, 1, 4, 4)
# Health check for one or more food records.
#
# x: a data frame (or list) with the flag fields HighSodium, HighSugar and
#    HighFat, each coded 0 or non-zero.
# Returns a character vector with one element per record: "Fail" when all
# three flags are non-zero, "Pass" otherwise. A single-row input returns a
# single string, exactly as before; multi-row input is now handled
# element-wise instead of failing in a scalar `if`.
healthcheck <- function(x) {
  required <- c("HighSodium", "HighSugar", "HighFat")
  absent <- required[!required %in% names(x)]
  if (length(absent) > 0) {
    stop("healthcheck: missing field(s): ", paste(absent, collapse = ", "),
         call. = FALSE)
  }
  all_high <- x$HighSodium != 0 & x$HighSugar != 0 & x$HighFat != 0
  result <- rep("Pass", length(all_high))
  result[which(all_high)] <- "Fail"
  result
}
# Run healthcheck() on every record and store the verdicts in a new
# HealthCheck column. seq_len() keeps this safe when the data frame has no
# rows, and vapply() builds the whole column at once (guaranteeing one string
# per record) instead of creating it implicitly inside a loop.
USDAclean$HealthCheck <- vapply(
  seq_len(nrow(USDAclean)),
  function(i) healthcheck(USDAclean[i, ]),
  character(1)
)
# Number of foods that fail the health check.
sum(USDAclean$HealthCheck == "Fail")
## [1] 237
# Correlation matrix of Calories and the five candidate predictors. The
# columns are selected by name so the result does not depend on their
# position in the data frame.
cor(USDAclean[c(
  "Calories", "Protein", "TotalFat", "Carbohydrate", "Sodium", "Cholesterol"
)])
## Calories Protein TotalFat Carbohydrate Sodium
## Calories 1.00000000 0.122122537 0.804495022 0.42460618 0.032321026
## Protein 0.12212254 1.000000000 0.057035611 -0.30471117 -0.003489485
## TotalFat 0.80449502 0.057035611 1.000000000 -0.12434291 0.002916089
## Carbohydrate 0.42460618 -0.304711167 -0.124342914 1.00000000 0.046838692
## Sodium 0.03232103 -0.003489485 0.002916089 0.04683869 1.000000000
## Cholesterol 0.02391933 0.269854840 0.093289601 -0.21937986 -0.017774863
## Cholesterol
## Calories 0.02391933
## Protein 0.26985484
## TotalFat 0.09328960
## Carbohydrate -0.21937986
## Sodium -0.01777486
## Cholesterol 1.00000000
# Pearson correlation test between Calories and TotalFat.
cor.test(x = USDAclean$Calories, y = USDAclean$TotalFat, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: USDAclean$Calories and USDAclean$TotalFat
## t = 107.58, df = 6308, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7956139 0.8130305
## sample estimates:
## cor
## 0.804495
#The correlation between Calories and Total Fat is statistically significant because
#the p-value of the test (< 2.2e-16) is far below 0.05. Their correlation coefficient is about 0.804495, which is close to a perfect positive correlation, 1.0, so the relationship is also strong.
# Fit the linear model of Calories on all five predictors and keep its
# summary in MOD.
full_fit <- lm(
  Calories ~ Protein + TotalFat + Carbohydrate + Sodium + Cholesterol,
  data = USDAclean
)
MOD <- summary(full_fit)
print(MOD)
##
## Call:
## lm(formula = Calories ~ Protein + TotalFat + Carbohydrate + Sodium +
## Cholesterol, data = USDAclean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -191.087 -3.832 0.426 5.147 291.011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9882753 0.4832629 8.253 < 2e-16 ***
## Protein 3.9891994 0.0233550 170.807 < 2e-16 ***
## TotalFat 8.7716980 0.0143291 612.158 < 2e-16 ***
## Carbohydrate 3.7432001 0.0091404 409.522 < 2e-16 ***
## Sodium 0.0003383 0.0002189 1.545 0.122
## Cholesterol 0.0110138 0.0019861 5.545 3.05e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.92 on 6304 degrees of freedom
## Multiple R-squared: 0.9877, Adjusted R-squared: 0.9877
## F-statistic: 1.009e+05 on 5 and 6304 DF, p-value: < 2.2e-16
# ANOVA table for the five-predictor model. aov() is given the model formula
# explicitly rather than the summary.lm object stored in MOD, which is not a
# formula and only worked because aov() could recover the terms from it.
summary(aov(
  Calories ~ Protein + TotalFat + Carbohydrate + Sodium + Cholesterol,
  data = USDAclean
))
## Df Sum Sq Mean Sq F value Pr(>F)
## Protein 1 2728899 2728899 7.620e+03 < 2e-16 ***
## TotalFat 1 116762840 116762840 3.260e+05 < 2e-16 ***
## Carbohydrate 1 61215495 61215495 1.709e+05 < 2e-16 ***
## Sodium 1 789 789 2.203e+00 0.138
## Cholesterol 1 11014 11014 3.075e+01 3.05e-08 ***
## Residuals 6304 2257685 358
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ANSWER: Sodium is the least significant variable: its F value (2.203) is the lowest of all the predictors, which makes it less significant than the other variables. In addition, its p-value (0.138) is above 0.05, so it is not statistically significant.
# Refit the model without Sodium and replace MOD with the new summary.
reduced_fit <- lm(
  Calories ~ Protein + TotalFat + Carbohydrate + Cholesterol,
  data = USDAclean
)
MOD <- summary(reduced_fit)
print(MOD)
##
## Call:
## lm(formula = Calories ~ Protein + TotalFat + Carbohydrate + Cholesterol,
## data = USDAclean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -191.220 -3.787 0.464 5.104 290.922
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.077907 0.479822 8.499 < 2e-16 ***
## Protein 3.989679 0.023355 170.824 < 2e-16 ***
## TotalFat 8.771904 0.014330 612.131 < 2e-16 ***
## Carbohydrate 3.743859 0.009131 409.996 < 2e-16 ***
## Cholesterol 0.010980 0.001986 5.528 3.36e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.93 on 6305 degrees of freedom
## Multiple R-squared: 0.9877, Adjusted R-squared: 0.9876
## F-statistic: 1.261e+05 on 4 and 6305 DF, p-value: < 2.2e-16
# Show the coefficients of the five-predictor model (Sodium included).
lm(
  formula = Calories ~ Protein + TotalFat + Carbohydrate + Sodium + Cholesterol,
  data = USDAclean
)
##
## Call:
## lm(formula = Calories ~ Protein + TotalFat + Carbohydrate + Sodium +
## Cholesterol, data = USDAclean)
##
## Coefficients:
## (Intercept) Protein TotalFat Carbohydrate Sodium
## 3.9882753 3.9891994 8.7716980 3.7432001 0.0003383
## Cholesterol
## 0.0110138
# Coefficients of the five-predictor calorie model, copied from the lm()
# output printed above.
calorie_coef <- c(
  Intercept = 3.9882753,
  Protein = 3.9891994,
  TotalFat = 8.7716980,
  Carbohydrate = 3.7432001,
  Sodium = 0.0003383,
  Cholesterol = 0.0110138
)

# Predicted Calories for one food: the intercept plus each nutrient value
# multiplied by its coefficient.
predict_calories <- function(protein, total_fat, carbohydrate, sodium,
                             cholesterol) {
  sum(calorie_coef * c(1, protein, total_fat, carbohydrate, sodium, cholesterol))
}

# Prediction for the original food (Sodium = 440).
baseline_pred <- predict_calories(0.1, 37, 400, 440, 75)
print(baseline_pred)
## [1] 1827.195
#The predicted value would be 1827.195

# Same food with Sodium raised from 440 to 40000.
pred_value <- predict_calories(0.1, 37, 400, 40000, 75)
print(pred_value)
## [1] 1840.578

# The increase is computed from the two predictions themselves; the earlier
# hard-coded 1842.08 did not match the 1840.578 printed above.
increase <- pred_value - baseline_pred
percentIncrease <- increase / baseline_pred * 100
print(percentIncrease)
## [1] 0.7324423
# Only Sodium changed between the two predictions. Its coefficient is very small (0.0003383) and not statistically significant, so even a large change in Sodium has a small effect on the predicted value. This is why the percent increase is only about 0.73 percent.