CMTH 642 Data Analytics: Advanced Methods Assignment 1

1. Read the csv files in the folder. (4 points)

# INSERT YOUR ANSWER HERE
USDA_micro <- read.csv('USDA_Micronutrients.csv')
USDA_macro <- read.csv('USDA_Macronutrients.csv')

head(USDA_micro)

head(USDA_macro)

names(USDA_macro)

## [1] "ID"           "Description"  "Calories"     "Protein"      "TotalFat"    
## [6] "Carbohydrate"

names(USDA_micro)

##  [1] "ID"          "Sodium"      "Cholesterol" "Sugar"       "Calcium"    
##  [6] "Iron"        "Potassium"   "VitaminC"    "VitaminE"    "VitaminD"

2. Merge the data frames using the variable “ID”. Name the Merged Data Frame “USDA”. (4 points)

# INSERT YOUR ANSWER HERE
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

USDA <- merge(USDA_micro, USDA_macro, on="ID")

head(USDA)

cat("USDA_macro has", nrow(USDA_macro), "and USDA_micro has", nrow(USDA_micro), "rows prior to merging. After merging, USDA has", nrow(USDA), "rows")

## USDA_macro has 7057 and USDA_micro has 7057 rows prior to merging. After merging, USDA has 7057 rows

3. Check the datatypes of the attributes. Delete the commas in the Sodium and Potasium records. Assign Sodium and Potasium as numeric data types. (6 points)

# INSERT YOUR ANSWER HERE
sapply(USDA, class)

##           ID       Sodium  Cholesterol        Sugar      Calcium         Iron 
##    "integer"     "factor"    "integer"    "numeric"    "integer"    "numeric" 
##    Potassium     VitaminC     VitaminE     VitaminD  Description     Calories 
##     "factor"    "numeric"    "numeric"    "numeric"     "factor"    "integer" 
##      Protein     TotalFat Carbohydrate 
##    "numeric"    "numeric"    "numeric"

names(USDA)

##  [1] "ID"           "Sodium"       "Cholesterol"  "Sugar"        "Calcium"     
##  [6] "Iron"         "Potassium"    "VitaminC"     "VitaminE"     "VitaminD"    
## [11] "Description"  "Calories"     "Protein"      "TotalFat"     "Carbohydrate"

str(USDA)

## 'data.frame':    7057 obs. of  15 variables:
##  $ ID          : int  1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
##  $ Sodium      : Factor w/ 1197 levels "","0","1","1,000",..: 972 1069 371 194 819 889 1084 946 882 960 ...
##  $ Cholesterol : int  215 219 256 75 94 100 72 93 105 103 ...
##  $ Sugar       : num  0.06 0.06 0 0.5 0.51 0.45 0.46 NA 0.52 NA ...
##  $ Calcium     : int  24 24 4 528 674 184 388 673 721 643 ...
##  $ Iron        : num  0.02 0.16 0 0.31 0.43 0.5 0.33 0.64 0.68 0.21 ...
##  $ Potassium   : Factor w/ 886 levels "","0","1","1,000",..: 313 335 608 331 168 186 225 863 876 868 ...
##  $ VitaminC    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VitaminE    : num  2.32 2.32 2.8 0.25 0.26 0.24 0.21 NA 0.29 NA ...
##  $ VitaminD    : num  1.5 1.5 1.8 0.5 0.5 0.5 0.4 NA 0.6 NA ...
##  $ Description : Factor w/ 7053 levels "ABALONE,MIXED SPECIES,RAW",..: 1302 1301 1297 2302 2303 2304 2305 2306 2307 2308 ...
##  $ Calories    : int  717 717 876 353 371 334 300 376 403 387 ...
##  $ Protein     : num  0.85 0.85 0.28 21.4 23.24 ...
##  $ TotalFat    : num  81.1 81.1 99.5 28.7 29.7 ...
##  $ Carbohydrate: num  0.06 0.06 0 2.34 2.79 0.45 0.46 3.06 1.28 4.78 ...

# create function to remove commas from columns
rmv_cmma <- function(x){paste(unlist(stringr::str_extract_all(pattern="\\w*[^,]\\w*", x)),collapse="")}

USDA$Sodium %<>% sapply(rmv_cmma) %>% as.numeric()
USDA$Potassium %<>% sapply(rmv_cmma) %>% as.numeric()
str(USDA)

## 'data.frame':    7057 obs. of  15 variables:
##  $ ID          : int  1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
##  $ Sodium      : num  714 827 2 1395 560 ...
##  $ Cholesterol : int  215 219 256 75 94 100 72 93 105 103 ...
##  $ Sugar       : num  0.06 0.06 0 0.5 0.51 0.45 0.46 NA 0.52 NA ...
##  $ Calcium     : int  24 24 4 528 674 184 388 673 721 643 ...
##  $ Iron        : num  0.02 0.16 0 0.31 0.43 0.5 0.33 0.64 0.68 0.21 ...
##  $ Potassium   : num  24 26 5 256 136 152 187 93 98 95 ...
##  $ VitaminC    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VitaminE    : num  2.32 2.32 2.8 0.25 0.26 0.24 0.21 NA 0.29 NA ...
##  $ VitaminD    : num  1.5 1.5 1.8 0.5 0.5 0.5 0.4 NA 0.6 NA ...
##  $ Description : Factor w/ 7053 levels "ABALONE,MIXED SPECIES,RAW",..: 1302 1301 1297 2302 2303 2304 2305 2306 2307 2308 ...
##  $ Calories    : int  717 717 876 353 371 334 300 376 403 387 ...
##  $ Protein     : num  0.85 0.85 0.28 21.4 23.24 ...
##  $ TotalFat    : num  81.1 81.1 99.5 28.7 29.7 ...
##  $ Carbohydrate: num  0.06 0.06 0 2.34 2.79 0.45 0.46 3.06 1.28 4.78 ...

head(USDA)

4. Remove records (rows) with missing values in more than 4 attributes (columns). How many records remain in the data frame? (6 points)

# INSERT YOUR ANSWER HERE

#First we determine how many missing values are present in each column
missing_values <- is.na(USDA)
summary(missing_values)

##      ID            Sodium        Cholesterol       Sugar        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:7057      FALSE:6974      FALSE:6770      FALSE:5148     
##                  TRUE :83        TRUE :287       TRUE :1909     
##   Calcium           Iron         Potassium        VitaminC      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:6922      FALSE:6935      FALSE:6649      FALSE:6726     
##  TRUE :135       TRUE :122       TRUE :408       TRUE :331      
##   VitaminE        VitaminD       Description      Calories      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:4338      FALSE:4224      FALSE:7057      FALSE:7057     
##  TRUE :2719      TRUE :2833                                     
##   Protein         TotalFat       Carbohydrate   
##  Mode :logical   Mode :logical   Mode :logical  
##  FALSE:7057      FALSE:7057      FALSE:7057     
##

#With the geom_raster function in ggplot we can visualize the missing values
library(ggplot2)
library(reshape2)
missing_values %>% melt %>% ggplot(data=., aes(x=Var1, y=Var2)) + geom_raster(aes(fill=value))

We can easily visualize a number of records here with more than 4 attributes containing missing values. Now we attempt to remove these. The columns that contain missing values include VitaminD, VitaminE, VitaminC, Potassium, Iron, Calcium, Sugar, Cholesterol and Sodium.

USDA1 <- USDA
q=apply(USDA1, 1, function(x){sum(is.na(x))>4})
USDA1 <- cbind(USDA1, q)
USDA1 <- USDA1[-c(which(USDA1$q==1)),]
USDA1 <- USDA1 %>% select(-q)
seq(length=nrow(USDA1)) -> rownames(USDA1)
head(USDA1)

missing_values <- is.na(USDA1)
missing_values %>% melt %>% ggplot(data=., aes(x=Var1, y=Var2)) + geom_raster(aes(fill=value))

cat("After removing missing rows, there are now", nrow(USDA1),"rows in the dataset")

## After removing missing rows, there are now 6887 rows in the dataset

5. For records with missing values for Sugar, Vitamin E and Vitamin D, replace missing values with mean value for the respective variable. (6 points)

# INSERT YOUR ANSWER HERE
USDA1 <- USDA1 %>%
  transform(Sugar = ifelse(is.na(Sugar), mean(Sugar, na.rm = TRUE), Sugar), 
            VitaminE = ifelse(is.na(VitaminE), mean(VitaminE, na.rm = TRUE), VitaminE),
            VitaminD = ifelse(is.na(VitaminD), mean(VitaminD, na.rm = TRUE), VitaminD))

Again we examine the heatmap:

is.na(USDA1) %>% 
  melt %>% 
  ggplot(data=., aes(x=Var1, y=Var2)) + geom_raster(aes(fill=value))

6. With a single line of code, remove all remaining records with missing values. Name the new Data Frame “USDAclean”. How many records remain in the data frame? (6 points)

# INSERT YOUR ANSWER HERE
USDAclean <- na.omit(USDA1)
seq(length=nrow(USDAclean)) -> rownames(USDAclean)
cat("The dataframe remains with",nrow(USDAclean),"rows")

## The dataframe remains with 6310 rows

Looking at the heatmap:

is.na(USDAclean) %>% 
  melt %>% 
  ggplot(data=., aes(x=Var1, y=Var2)) + geom_raster(aes(fill=value))

7. Which food has the highest sodium level? (6 points)

# INSERT YOUR ANSWER HERE
USDAclean[which.max(USDAclean$Sodium),]$Description

## [1] SALT,TABLE
## 7053 Levels: ABALONE,MIXED SPECIES,RAW ABALONE,MXD SP,CKD,FRIED ... ZWIEBACK

8. Create a histogram of Vitamin C distribution in foods, with a limit of 0 to 100 on the x-axis and breaks of 100. (6 points)

# INSERT YOUR ANSWER HERE
hist(USDAclean$VitaminC, xlim=c(0,100), breaks=100, main="Histogram of VitaminC",
     xlab="VitaminC")

Re-attempting in log scale:

hist(log(USDAclean$VitaminC), breaks=100, main=bquote("Histogram of VitaminC in" ~ log ~ "scale"), xlab="VitaminC")

9. Create a boxplot to illustrate the distribution of values for TotalFat, Protein and Carbohydrate. (6 points)

# INSERT YOUR ANSWER HERE
#par(mfrow=c(2,2))
USDAclean %>% select(.data$TotalFat, .data$Protein, .data$Carbohydrate) %>% melt %>% ggplot(data=.,aes(x=variable,y=value))+geom_boxplot() + labs(title="Boxplot of TotalFat, Protein & Carbohydrate", x = "Food Type", y="Values")

## No id variables; using all as measure variables

10. Create a scatterplot to illustrate the relationship between a food’s TotalFat content and its calorie content. (6 points)

# INSERT YOUR ANSWER HERE
plot(Calories~TotalFat, data=USDAclean, pch=19, main="Food TotalFat vs. Calories")

We can attempt to get some more clarity in plot by adding the sugat content in the third dimension as shape

plot(Calories~TotalFat, col=Sugar, data=USDAclean, main="Food TotalFat vs. Calories")

The color does not seem to show any bias in sugar content with respect to TotalFat and calories in the food records provided.

11. Add a variable to the data frame that takes value 1 if the food has higher sodium than average, 0 otherwise.Call this variable HighSodium. Do the same for High Calories, High Protein, High Sugar, and High Fat. How many foods have both high sodium and high fat? (8 points)

# INSERT YOUR ANSWER HERE
# USDAclean <- USDAclean %>% select(-q)
USDAclean <- USDAclean %>% transform(HighSodium = ifelse(Sodium > mean(Sodium),1,0),
                        HighCalories = ifelse(Calories > mean(Calories),1,0),
                        HighProtein = ifelse(Protein > mean(Protein),1,0),
                        HighSugar = ifelse(Sugar > mean(Sugar),1,0),
                        HighFat = ifelse(TotalFat > mean(TotalFat),1,0))
head(USDAclean)

subset(USDAclean, (HighSodium==1 & HighFat==1)) %>% nrow

## [1] 644

644 Foods have both high sodium and high fat

12. Calculate the average amount of iron, sorted by high and low protein. (8 points)

# INSERT YOUR ANSWER HERE
USDAclean %>% group_by(HighProtein) %>% summarise(Iron = mean(Iron)) %>% .[order(-.[,1]),]

13. Create a script for a “HealthCheck” program to detect unhealthy foods. Use the algorithm flowchart below as a basis for this script. (8 points)

require(jpeg)

## Loading required package: jpeg

img<-readJPEG("HealthCheck.jpg")
plot(1:4, ty = 'n', ann = FALSE, xaxt = 'n', yaxt = 'n')
rasterImage(img,1,1,4,4)

# INSERT YOUR ANSWER HERE

hprogram <- function(df){
  df <- df %>% 
    dplyr::mutate(healthcheck = case_when(
      HighSodium == 0 ~ "Pass",
      HighSodium == 1 & HighSugar == 0 ~ "Pass",
      HighSodium == 1 & HighSugar == 1 & HighFat == 0 ~ "Pass",
      HighSodium == 1 & HighSugar == 1 & HighFat == 1 ~ "Fail"
    ))
  return(df)
}

14. Add a new variable called HealthCheck to the data frame using the output of the function. (8 points)

# INSERT YOUR ANSWER HERE
USDAclean <- USDAclean %>% hprogram
head(USDAclean)

15. How many foods in the USDAclean data frame fail the HealthCheck? (8 points)

# INSERT YOUR ANSWER HERE
subset(USDAclean, healthcheck=="Fail") %>% nrow

## [1] 237

A total of 237 foods failed the health check

16. Save your final data frame as “USDAclean_ [your last name]” (4 points)

# INSERT YOUR ANSWER HERE
USDAclean_new <- USDAclean
str(USDAclean_new)

## 'data.frame':    6310 obs. of  21 variables:
##  $ ID          : int  1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
##  $ Sodium      : num  714 827 2 1395 560 ...
##  $ Cholesterol : int  215 219 256 75 94 100 72 93 105 103 ...
##  $ Sugar       : num  0.06 0.06 0 0.5 0.51 ...
##  $ Calcium     : int  24 24 4 528 674 184 388 673 721 643 ...
##  $ Iron        : num  0.02 0.16 0 0.31 0.43 0.5 0.33 0.64 0.68 0.21 ...
##  $ Potassium   : num  24 26 5 256 136 152 187 93 98 95 ...
##  $ VitaminC    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VitaminE    : num  2.32 2.32 2.8 0.25 0.26 ...
##  $ VitaminD    : num  1.5 1.5 1.8 0.5 0.5 ...
##  $ Description : Factor w/ 7053 levels "ABALONE,MIXED SPECIES,RAW",..: 1302 1301 1297 2302 2303 2304 2305 2306 2307 2308 ...
##  $ Calories    : int  717 717 876 353 371 334 300 376 403 387 ...
##  $ Protein     : num  0.85 0.85 0.28 21.4 23.24 ...
##  $ TotalFat    : num  81.1 81.1 99.5 28.7 29.7 ...
##  $ Carbohydrate: num  0.06 0.06 0 2.34 2.79 0.45 0.46 3.06 1.28 4.78 ...
##  $ HighSodium  : num  1 1 0 1 1 1 1 1 1 1 ...
##  $ HighCalories: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ HighProtein : num  0 0 0 1 1 1 1 1 1 1 ...
##  $ HighSugar   : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ HighFat     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ healthcheck : chr  "Pass" "Pass" "Pass" "Pass" ...

head(USDAclean_new)