# INSERT YOUR ANSWER HERE
USDA_micro <- read.csv('USDA_Micronutrients.csv')
USDA_macro <- read.csv('USDA_Macronutrients.csv')
head(USDA_micro)
head(USDA_macro)
names(USDA_macro)
## [1] "ID" "Description" "Calories" "Protein" "TotalFat"
## [6] "Carbohydrate"
names(USDA_micro)
## [1] "ID" "Sodium" "Cholesterol" "Sugar" "Calcium"
## [6] "Iron" "Potassium" "VitaminC" "VitaminE" "VitaminD"
# INSERT YOUR ANSWER HERE
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
USDA <- merge(USDA_micro, USDA_macro, on="ID")
head(USDA)
cat("USDA_macro has", nrow(USDA_macro), "and USDA_micro has", nrow(USDA_micro), "rows prior to merging. After merging, USDA has", nrow(USDA), "rows")
## USDA_macro has 7057 and USDA_micro has 7057 rows prior to merging. After merging, USDA has 7057 rows
# INSERT YOUR ANSWER HERE
sapply(USDA, class)
## ID Sodium Cholesterol Sugar Calcium Iron
## "integer" "factor" "integer" "numeric" "integer" "numeric"
## Potassium VitaminC VitaminE VitaminD Description Calories
## "factor" "numeric" "numeric" "numeric" "factor" "integer"
## Protein TotalFat Carbohydrate
## "numeric" "numeric" "numeric"
names(USDA)
## [1] "ID" "Sodium" "Cholesterol" "Sugar" "Calcium"
## [6] "Iron" "Potassium" "VitaminC" "VitaminE" "VitaminD"
## [11] "Description" "Calories" "Protein" "TotalFat" "Carbohydrate"
str(USDA)
## 'data.frame': 7057 obs. of 15 variables:
## $ ID : int 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
## $ Sodium : Factor w/ 1197 levels "","0","1","1,000",..: 972 1069 371 194 819 889 1084 946 882 960 ...
## $ Cholesterol : int 215 219 256 75 94 100 72 93 105 103 ...
## $ Sugar : num 0.06 0.06 0 0.5 0.51 0.45 0.46 NA 0.52 NA ...
## $ Calcium : int 24 24 4 528 674 184 388 673 721 643 ...
## $ Iron : num 0.02 0.16 0 0.31 0.43 0.5 0.33 0.64 0.68 0.21 ...
## $ Potassium : Factor w/ 886 levels "","0","1","1,000",..: 313 335 608 331 168 186 225 863 876 868 ...
## $ VitaminC : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VitaminE : num 2.32 2.32 2.8 0.25 0.26 0.24 0.21 NA 0.29 NA ...
## $ VitaminD : num 1.5 1.5 1.8 0.5 0.5 0.5 0.4 NA 0.6 NA ...
## $ Description : Factor w/ 7053 levels "ABALONE,MIXED SPECIES,RAW",..: 1302 1301 1297 2302 2303 2304 2305 2306 2307 2308 ...
## $ Calories : int 717 717 876 353 371 334 300 376 403 387 ...
## $ Protein : num 0.85 0.85 0.28 21.4 23.24 ...
## $ TotalFat : num 81.1 81.1 99.5 28.7 29.7 ...
## $ Carbohydrate: num 0.06 0.06 0 2.34 2.79 0.45 0.46 3.06 1.28 4.78 ...
# create function to remove commas from columns
rmv_cmma <- function(x){paste(unlist(stringr::str_extract_all(pattern="\\w*[^,]\\w*", x)),collapse="")}
USDA$Sodium %<>% sapply(rmv_cmma) %>% as.numeric()
USDA$Potassium %<>% sapply(rmv_cmma) %>% as.numeric()
str(USDA)
## 'data.frame': 7057 obs. of 15 variables:
## $ ID : int 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
## $ Sodium : num 714 827 2 1395 560 ...
## $ Cholesterol : int 215 219 256 75 94 100 72 93 105 103 ...
## $ Sugar : num 0.06 0.06 0 0.5 0.51 0.45 0.46 NA 0.52 NA ...
## $ Calcium : int 24 24 4 528 674 184 388 673 721 643 ...
## $ Iron : num 0.02 0.16 0 0.31 0.43 0.5 0.33 0.64 0.68 0.21 ...
## $ Potassium : num 24 26 5 256 136 152 187 93 98 95 ...
## $ VitaminC : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VitaminE : num 2.32 2.32 2.8 0.25 0.26 0.24 0.21 NA 0.29 NA ...
## $ VitaminD : num 1.5 1.5 1.8 0.5 0.5 0.5 0.4 NA 0.6 NA ...
## $ Description : Factor w/ 7053 levels "ABALONE,MIXED SPECIES,RAW",..: 1302 1301 1297 2302 2303 2304 2305 2306 2307 2308 ...
## $ Calories : int 717 717 876 353 371 334 300 376 403 387 ...
## $ Protein : num 0.85 0.85 0.28 21.4 23.24 ...
## $ TotalFat : num 81.1 81.1 99.5 28.7 29.7 ...
## $ Carbohydrate: num 0.06 0.06 0 2.34 2.79 0.45 0.46 3.06 1.28 4.78 ...
head(USDA)
# INSERT YOUR ANSWER HERE
#First we determine how many missing values are present in each column
missing_values <- is.na(USDA)
summary(missing_values)
## ID Sodium Cholesterol Sugar
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:7057 FALSE:6974 FALSE:6770 FALSE:5148
## TRUE :83 TRUE :287 TRUE :1909
## Calcium Iron Potassium VitaminC
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:6922 FALSE:6935 FALSE:6649 FALSE:6726
## TRUE :135 TRUE :122 TRUE :408 TRUE :331
## VitaminE VitaminD Description Calories
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:4338 FALSE:4224 FALSE:7057 FALSE:7057
## TRUE :2719 TRUE :2833
## Protein TotalFat Carbohydrate
## Mode :logical Mode :logical Mode :logical
## FALSE:7057 FALSE:7057 FALSE:7057
##
#With the geom_raster function in ggplot we can visualize the missing values
library(ggplot2)
library(reshape2)
missing_values %>% melt %>% ggplot(data=., aes(x=Var1, y=Var2)) + geom_raster(aes(fill=value))
We can easily visualize a number of records here with more than 4 attributes containing missing values. Now we attempt to remove these. The columns that contain missing values include VitaminD, VitaminE, VitaminC, Potassium, Iron, Calcium, Sugar, Cholesterol and Sodium.
USDA1 <- USDA
q=apply(USDA1, 1, function(x){sum(is.na(x))>4})
USDA1 <- cbind(USDA1, q)
USDA1 <- USDA1[-c(which(USDA1$q==1)),]
USDA1 <- USDA1 %>% select(-q)
seq(length=nrow(USDA1)) -> rownames(USDA1)
head(USDA1)
missing_values <- is.na(USDA1)
missing_values %>% melt %>% ggplot(data=., aes(x=Var1, y=Var2)) + geom_raster(aes(fill=value))
cat("After removing missing rows, there are now", nrow(USDA1),"rows in the dataset")
## After removing missing rows, there are now 6887 rows in the dataset
# INSERT YOUR ANSWER HERE
USDA1 <- USDA1 %>%
transform(Sugar = ifelse(is.na(Sugar), mean(Sugar, na.rm = TRUE), Sugar),
VitaminE = ifelse(is.na(VitaminE), mean(VitaminE, na.rm = TRUE), VitaminE),
VitaminD = ifelse(is.na(VitaminD), mean(VitaminD, na.rm = TRUE), VitaminD))
Again we examine the heatmap:
is.na(USDA1) %>%
melt %>%
ggplot(data=., aes(x=Var1, y=Var2)) + geom_raster(aes(fill=value))
# INSERT YOUR ANSWER HERE
USDAclean <- na.omit(USDA1)
seq(length=nrow(USDAclean)) -> rownames(USDAclean)
cat("The dataframe remains with",nrow(USDAclean),"rows")
## The dataframe remains with 6310 rows
Looking at the heatmap:
is.na(USDAclean) %>%
melt %>%
ggplot(data=., aes(x=Var1, y=Var2)) + geom_raster(aes(fill=value))
# INSERT YOUR ANSWER HERE
USDAclean[which.max(USDAclean$Sodium),]$Description
## [1] SALT,TABLE
## 7053 Levels: ABALONE,MIXED SPECIES,RAW ABALONE,MXD SP,CKD,FRIED ... ZWIEBACK
# INSERT YOUR ANSWER HERE
hist(USDAclean$VitaminC, xlim=c(0,100), breaks=100, main="Histogram of VitaminC",
xlab="VitaminC")
Re-attempting in log scale:
hist(log(USDAclean$VitaminC), breaks=100, main=bquote("Histogram of VitaminC in" ~ log ~ "scale"), xlab="VitaminC")
# INSERT YOUR ANSWER HERE
#par(mfrow=c(2,2))
USDAclean %>% select(.data$TotalFat, .data$Protein, .data$Carbohydrate) %>% melt %>% ggplot(data=.,aes(x=variable,y=value))+geom_boxplot() + labs(title="Boxplot of TotalFat, Protein & Carbohydrate", x = "Food Type", y="Values")
## No id variables; using all as measure variables
# INSERT YOUR ANSWER HERE
plot(Calories~TotalFat, data=USDAclean, pch=19, main="Food TotalFat vs. Calories")
We can attempt to get some more clarity in plot by adding the sugat content in the third dimension as shape
plot(Calories~TotalFat, col=Sugar, data=USDAclean, main="Food TotalFat vs. Calories")
The color does not seem to show any bias in sugar content with respect to TotalFat and calories in the food records provided.
# INSERT YOUR ANSWER HERE
# USDAclean <- USDAclean %>% select(-q)
USDAclean <- USDAclean %>% transform(HighSodium = ifelse(Sodium > mean(Sodium),1,0),
HighCalories = ifelse(Calories > mean(Calories),1,0),
HighProtein = ifelse(Protein > mean(Protein),1,0),
HighSugar = ifelse(Sugar > mean(Sugar),1,0),
HighFat = ifelse(TotalFat > mean(TotalFat),1,0))
head(USDAclean)
subset(USDAclean, (HighSodium==1 & HighFat==1)) %>% nrow
## [1] 644
644 Foods have both high sodium and high fat
# INSERT YOUR ANSWER HERE
USDAclean %>% group_by(HighProtein) %>% summarise(Iron = mean(Iron)) %>% .[order(-.[,1]),]
require(jpeg)
## Loading required package: jpeg
img<-readJPEG("HealthCheck.jpg")
plot(1:4, ty = 'n', ann = FALSE, xaxt = 'n', yaxt = 'n')
rasterImage(img,1,1,4,4)
# INSERT YOUR ANSWER HERE
hprogram <- function(df){
df <- df %>%
dplyr::mutate(healthcheck = case_when(
HighSodium == 0 ~ "Pass",
HighSodium == 1 & HighSugar == 0 ~ "Pass",
HighSodium == 1 & HighSugar == 1 & HighFat == 0 ~ "Pass",
HighSodium == 1 & HighSugar == 1 & HighFat == 1 ~ "Fail"
))
return(df)
}
# INSERT YOUR ANSWER HERE
USDAclean <- USDAclean %>% hprogram
head(USDAclean)
# INSERT YOUR ANSWER HERE
subset(USDAclean, healthcheck=="Fail") %>% nrow
## [1] 237
A total of 237 foods failed the health check
# INSERT YOUR ANSWER HERE
USDAclean_new <- USDAclean
str(USDAclean_new)
## 'data.frame': 6310 obs. of 21 variables:
## $ ID : int 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
## $ Sodium : num 714 827 2 1395 560 ...
## $ Cholesterol : int 215 219 256 75 94 100 72 93 105 103 ...
## $ Sugar : num 0.06 0.06 0 0.5 0.51 ...
## $ Calcium : int 24 24 4 528 674 184 388 673 721 643 ...
## $ Iron : num 0.02 0.16 0 0.31 0.43 0.5 0.33 0.64 0.68 0.21 ...
## $ Potassium : num 24 26 5 256 136 152 187 93 98 95 ...
## $ VitaminC : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VitaminE : num 2.32 2.32 2.8 0.25 0.26 ...
## $ VitaminD : num 1.5 1.5 1.8 0.5 0.5 ...
## $ Description : Factor w/ 7053 levels "ABALONE,MIXED SPECIES,RAW",..: 1302 1301 1297 2302 2303 2304 2305 2306 2307 2308 ...
## $ Calories : int 717 717 876 353 371 334 300 376 403 387 ...
## $ Protein : num 0.85 0.85 0.28 21.4 23.24 ...
## $ TotalFat : num 81.1 81.1 99.5 28.7 29.7 ...
## $ Carbohydrate: num 0.06 0.06 0 2.34 2.79 0.45 0.46 3.06 1.28 4.78 ...
## $ HighSodium : num 1 1 0 1 1 1 1 1 1 1 ...
## $ HighCalories: num 1 1 1 1 1 1 1 1 1 1 ...
## $ HighProtein : num 0 0 0 1 1 1 1 1 1 1 ...
## $ HighSugar : num 0 0 0 0 0 0 0 1 0 1 ...
## $ HighFat : num 1 1 1 1 1 1 1 1 1 1 ...
## $ healthcheck : chr "Pass" "Pass" "Pass" "Pass" ...
head(USDAclean_new)