# NOTE(review): setwd() to an absolute, user-specific path only works on the
# original author's machine. It is kept here because code outside this section
# may rely on the working directory it sets.
setwd("C:/Users/jzchen/Documents/Courses/Analytics Edge/Unit 1")
# Read the csv file.
# stringsAsFactors = TRUE is spelled out so the Description column is read as
# a factor on every R version. Since R 4.0.0 the default is FALSE, which would
# make Description a character column and no longer match the factor output
# recorded in the comments of this script.
USDA = read.csv("USDA.csv", stringsAsFactors = TRUE)
# Structure of the dataset
str(USDA)
## 'data.frame': 7058 obs. of 16 variables:
## $ ID : int 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
## $ Description : Factor w/ 7054 levels "ABALONE,MIXED SPECIES,RAW",..: 1303 1302 1298 2303 2304 2305 2306 2307 2308 2309 ...
## $ Calories : int 717 717 876 353 371 334 300 376 403 387 ...
## $ Protein : num 0.85 0.85 0.28 21.4 23.24 ...
## $ TotalFat : num 81.1 81.1 99.5 28.7 29.7 ...
## $ Carbohydrate: num 0.06 0.06 0 2.34 2.79 0.45 0.46 3.06 1.28 4.78 ...
## $ Sodium : int 714 827 2 1395 560 629 842 690 621 700 ...
## $ SaturatedFat: num 51.4 50.5 61.9 18.7 18.8 ...
## $ Cholesterol : int 215 219 256 75 94 100 72 93 105 103 ...
## $ Sugar : num 0.06 0.06 0 0.5 0.51 0.45 0.46 NA 0.52 NA ...
## $ Calcium : int 24 24 4 528 674 184 388 673 721 643 ...
## $ Iron : num 0.02 0.16 0 0.31 0.43 0.5 0.33 0.64 0.68 0.21 ...
## $ Potassium : int 24 26 5 256 136 152 187 93 98 95 ...
## $ VitaminC : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VitaminE : num 2.32 2.32 2.8 0.25 0.26 0.24 0.21 NA 0.29 NA ...
## $ VitaminD : num 1.5 1.5 1.8 0.5 0.5 0.5 0.4 NA 0.6 NA ...
# Statistical summary: prints, for every column of USDA, its distribution
# (min, quartiles, mean and max for the numeric columns) and the number of NA's
summary(USDA)
## ID
## Min. : 1001
## 1st Qu.: 8387
## Median :13294
## Mean :14260
## 3rd Qu.:18337
## Max. :93600
##
## Description
## BEEF,CHUCK,UNDER BLADE CNTR STEAK,BNLESS,DENVER CUT,LN,0" FA: 2
## CAMPBELL,CAMPBELL'S SEL MICROWAVEABLE BOWLS,HEA : 2
## OIL,INDUSTRIAL,PALM KERNEL (HYDROGENATED),CONFECTION FAT : 2
## POPCORN,OIL-POPPED,LOFAT : 2
## ABALONE,MIXED SPECIES,RAW : 1
## ABALONE,MXD SP,CKD,FRIED : 1
## (Other) :7048
## Calories Protein TotalFat Carbohydrate
## Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 85.0 1st Qu.: 2.29 1st Qu.: 0.72 1st Qu.: 0.00
## Median :181.0 Median : 8.20 Median : 4.37 Median : 7.13
## Mean :219.7 Mean :11.71 Mean : 10.32 Mean : 20.70
## 3rd Qu.:331.0 3rd Qu.:20.43 3rd Qu.: 12.70 3rd Qu.: 28.17
## Max. :902.0 Max. :88.32 Max. :100.00 Max. :100.00
## NA's :1 NA's :1 NA's :1 NA's :1
## Sodium SaturatedFat Cholesterol Sugar
## Min. : 0.0 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 37.0 1st Qu.: 0.172 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 79.0 Median : 1.256 Median : 3.00 Median : 1.395
## Mean : 322.1 Mean : 3.452 Mean : 41.55 Mean : 8.257
## 3rd Qu.: 386.0 3rd Qu.: 4.028 3rd Qu.: 69.00 3rd Qu.: 7.875
## Max. :38758.0 Max. :95.600 Max. :3100.00 Max. :99.800
## NA's :84 NA's :301 NA's :288 NA's :1910
## Calcium Iron Potassium VitaminC
## Min. : 0.00 Min. : 0.000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 9.00 1st Qu.: 0.520 1st Qu.: 135.0 1st Qu.: 0.000
## Median : 19.00 Median : 1.330 Median : 250.0 Median : 0.000
## Mean : 73.53 Mean : 2.828 Mean : 301.4 Mean : 9.436
## 3rd Qu.: 56.00 3rd Qu.: 2.620 3rd Qu.: 348.0 3rd Qu.: 3.100
## Max. :7364.00 Max. :123.600 Max. :16500.0 Max. :2400.000
## NA's :136 NA's :123 NA's :409 NA's :332
## VitaminE VitaminD
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.120 1st Qu.: 0.0000
## Median : 0.270 Median : 0.0000
## Mean : 1.488 Mean : 0.5769
## 3rd Qu.: 0.710 3rd Qu.: 0.1000
## Max. :149.400 Max. :250.0000
## NA's :2720 NA's :2834
# Video 3 - Basic Data Analysis
# Finding the index of the food with highest sodium levels.
# The index is stored in a variable instead of being copied by hand from the
# printed output, so the lookup below stays correct if the data changes.
MaxSodiumIndex = which.max(USDA$Sodium)
MaxSodiumIndex
## [1] 265
# Get names of variables in the dataset
names(USDA)
## [1] "ID" "Description" "Calories" "Protein"
## [5] "TotalFat" "Carbohydrate" "Sodium" "SaturatedFat"
## [9] "Cholesterol" "Sugar" "Calcium" "Iron"
## [13] "Potassium" "VitaminC" "VitaminE" "VitaminD"
# Get the name of the food with highest sodium levels
USDA$Description[MaxSodiumIndex]
## [1] SALT,TABLE
## 7054 Levels: ABALONE,MIXED SPECIES,RAW ... ZWIEBACK
# Create a subset of the foods with sodium content above 10,000mg
# (subset() treats a missing Sodium as FALSE, so those rows are left out)
HighSodium = subset(USDA, Sodium > 10000)
# Count the number of rows, or observations
nrow(HighSodium)
## [1] 10
# Output names of the foods with high sodium content
HighSodium$Description
## [1] SALT,TABLE
## [2] SOUP,BF BROTH OR BOUILLON,PDR,DRY
## [3] SOUP,BEEF BROTH,CUBED,DRY
## [4] SOUP,CHICK BROTH OR BOUILLON,DRY
## [5] SOUP,CHICK BROTH CUBES,DRY
## [6] GRAVY,AU JUS,DRY
## [7] ADOBO FRESCO
## [8] LEAVENING AGENTS,BAKING PDR,DOUBLE-ACTING,NA AL SULFATE
## [9] LEAVENING AGENTS,BAKING SODA
## [10] DESSERTS,RENNIN,TABLETS,UNSWTND
## 7054 Levels: ABALONE,MIXED SPECIES,RAW ... ZWIEBACK
# Finding the index of CAVIAR in the dataset
# (match() returns the position of the first exact match, or NA if there is none)
CaviarIndex = match("CAVIAR", USDA$Description)
CaviarIndex
## [1] 4154
# Find amount of sodium in caviar, reusing the index found above
USDA$Sodium[CaviarIndex]
## [1] 1500
# Doing it in one command!
USDA$Sodium[match("CAVIAR", USDA$Description)]
## [1] 1500
# Summary function over Sodium vector
summary(USDA$Sodium)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 37.0 79.0 322.1 386.0 38760.0 84
# Standard deviation (missing values are excluded, otherwise the result is NA)
sd(USDA$Sodium, na.rm = TRUE)
## [1] 1045.417
# Video 4 - Plots
# Scatter plot of total fat against protein, using the default labels
plot(USDA$Protein, USDA$TotalFat)

# Same scatter plot with a title, axis labels and red points
plot(
  USDA$Protein, USDA$TotalFat,
  main = "Protein vs Fat",
  xlab = "Protein",
  ylab = "Fat",
  col = "red"
)

# Histogram of the Vitamin C values
hist(
  USDA$VitaminC,
  main = "Histogram of Vitamin C",
  xlab = "Vitamin C (mg)"
)

# Restrict the x-axis to the 0-100 range
hist(
  USDA$VitaminC,
  main = "Histogram of Vitamin C",
  xlab = "Vitamin C (mg)",
  xlim = c(0, 100)
)

# Same x-axis range, asking for 100 cells
hist(
  USDA$VitaminC,
  main = "Histogram of Vitamin C",
  xlab = "Vitamin C (mg)",
  xlim = c(0, 100),
  breaks = 100
)

# Same x-axis range, asking for 2000 cells
hist(
  USDA$VitaminC,
  main = "Histogram of Vitamin C",
  xlab = "Vitamin C (mg)",
  xlim = c(0, 100),
  breaks = 2000
)

# Boxplot of the Sugar values
boxplot(
  USDA$Sugar,
  main = "Boxplot of Sugar",
  ylab = "Sugar (g)"
)

# Video 5 - Adding a variable
# Helper: returns 1 where a value is above the mean of its own vector (the
# mean is computed ignoring NAs) and 0 otherwise. Missing values stay NA.
AboveMean = function(x) {
  as.numeric(x > mean(x, na.rm = TRUE))
}
# Creating a variable that takes value 1 if the food has higher sodium than average, 0 otherwise
# NOTE: this reuses the name HighSodium, replacing the data frame of
# high-sodium foods created earlier in the script.
HighSodium = AboveMean(USDA$Sodium)
str(HighSodium)
## num [1:7058] 1 1 0 1 1 1 1 1 1 1 ...
# Adding the variable to the dataset (reuses the vector computed above)
USDA$HighSodium = HighSodium
# Similarly for HighCarbs, HighProtein, HighFat
USDA$HighCarbs = AboveMean(USDA$Carbohydrate)
USDA$HighProtein = AboveMean(USDA$Protein)
USDA$HighFat = AboveMean(USDA$TotalFat)
# Video 6 - Summary Tables
# How many foods have higher sodium level than average?
# (table() leaves out NA values by default, so foods with missing sodium are not counted)
table(USDA$HighSodium)
##
## 0 1
## 4884 2090
# How many foods have both high sodium and high fat?
# (rows are HighSodium, columns are HighFat)
table(USDA$HighSodium, USDA$HighFat)
##
## 0 1
## 0 3529 1355
## 1 1378 712
# Average amount of iron, grouped by low (0) and high (1) protein
tapply(USDA$Iron, USDA$HighProtein, mean, na.rm = TRUE)
## 0 1
## 2.558945 3.197294
# Maximum level of Vitamin C in foods with low (0) and high (1) carbs
tapply(USDA$VitaminC, USDA$HighCarbs, max, na.rm = TRUE)
## 0 1
## 1677.6 2400.0
# Using summary function with tapply
# (no na.rm here: summary() has no such argument for numeric vectors and
# reports the missing values itself in the NA's column)
tapply(USDA$VitaminC, USDA$HighCarbs, summary)
## $`0`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 6.364 2.800 1678.000 248
##
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 0.20 16.31 4.50 2400.00 83