# Directory holding the course data. The file is addressed by its full path
# instead of changing the session's working directory with setwd(), so running
# this script does not alter where other code reads and writes files.
  data_dir <- "C:/Users/jzchen/Documents/Courses/Analytics Edge/Unit 1"
  usda_path <- file.path(data_dir, "USDA.csv")
# Fail early with a clear message if the data file is not where expected
  if (!file.exists(usda_path)) {
    stop("USDA data file not found: ", usda_path, call. = FALSE)
  }
# Read the csv file
# stringsAsFactors is set explicitly so that Description is read as a factor,
# as in the recorded output below, whatever the default of the R version in use.
  USDA <- read.csv(usda_path, stringsAsFactors = TRUE)
# Structure of the dataset
  str(USDA)
## 'data.frame':    7058 obs. of  16 variables:
##  $ ID          : int  1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
##  $ Description : Factor w/ 7054 levels "ABALONE,MIXED SPECIES,RAW",..: 1303 1302 1298 2303 2304 2305 2306 2307 2308 2309 ...
##  $ Calories    : int  717 717 876 353 371 334 300 376 403 387 ...
##  $ Protein     : num  0.85 0.85 0.28 21.4 23.24 ...
##  $ TotalFat    : num  81.1 81.1 99.5 28.7 29.7 ...
##  $ Carbohydrate: num  0.06 0.06 0 2.34 2.79 0.45 0.46 3.06 1.28 4.78 ...
##  $ Sodium      : int  714 827 2 1395 560 629 842 690 621 700 ...
##  $ SaturatedFat: num  51.4 50.5 61.9 18.7 18.8 ...
##  $ Cholesterol : int  215 219 256 75 94 100 72 93 105 103 ...
##  $ Sugar       : num  0.06 0.06 0 0.5 0.51 0.45 0.46 NA 0.52 NA ...
##  $ Calcium     : int  24 24 4 528 674 184 388 673 721 643 ...
##  $ Iron        : num  0.02 0.16 0 0.31 0.43 0.5 0.33 0.64 0.68 0.21 ...
##  $ Potassium   : int  24 26 5 256 136 152 187 93 98 95 ...
##  $ VitaminC    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VitaminE    : num  2.32 2.32 2.8 0.25 0.26 0.24 0.21 NA 0.29 NA ...
##  $ VitaminD    : num  1.5 1.5 1.8 0.5 0.5 0.5 0.4 NA 0.6 NA ...
# Statistical summary (per-column quartiles, mean and NA counts)
  summary(USDA)
##        ID       
##  Min.   : 1001  
##  1st Qu.: 8387  
##  Median :13294  
##  Mean   :14260  
##  3rd Qu.:18337  
##  Max.   :93600  
##                 
##                                                        Description  
##  BEEF,CHUCK,UNDER BLADE CNTR STEAK,BNLESS,DENVER CUT,LN,0" FA:   2  
##  CAMPBELL,CAMPBELL'S SEL MICROWAVEABLE BOWLS,HEA             :   2  
##  OIL,INDUSTRIAL,PALM KERNEL (HYDROGENATED),CONFECTION FAT    :   2  
##  POPCORN,OIL-POPPED,LOFAT                                    :   2  
##  ABALONE,MIXED SPECIES,RAW                                   :   1  
##  ABALONE,MXD SP,CKD,FRIED                                    :   1  
##  (Other)                                                     :7048  
##     Calories        Protein         TotalFat       Carbohydrate   
##  Min.   :  0.0   Min.   : 0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.: 85.0   1st Qu.: 2.29   1st Qu.:  0.72   1st Qu.:  0.00  
##  Median :181.0   Median : 8.20   Median :  4.37   Median :  7.13  
##  Mean   :219.7   Mean   :11.71   Mean   : 10.32   Mean   : 20.70  
##  3rd Qu.:331.0   3rd Qu.:20.43   3rd Qu.: 12.70   3rd Qu.: 28.17  
##  Max.   :902.0   Max.   :88.32   Max.   :100.00   Max.   :100.00  
##  NA's   :1       NA's   :1       NA's   :1        NA's   :1       
##      Sodium         SaturatedFat     Cholesterol          Sugar       
##  Min.   :    0.0   Min.   : 0.000   Min.   :   0.00   Min.   : 0.000  
##  1st Qu.:   37.0   1st Qu.: 0.172   1st Qu.:   0.00   1st Qu.: 0.000  
##  Median :   79.0   Median : 1.256   Median :   3.00   Median : 1.395  
##  Mean   :  322.1   Mean   : 3.452   Mean   :  41.55   Mean   : 8.257  
##  3rd Qu.:  386.0   3rd Qu.: 4.028   3rd Qu.:  69.00   3rd Qu.: 7.875  
##  Max.   :38758.0   Max.   :95.600   Max.   :3100.00   Max.   :99.800  
##  NA's   :84        NA's   :301      NA's   :288       NA's   :1910    
##     Calcium             Iron           Potassium          VitaminC       
##  Min.   :   0.00   Min.   :  0.000   Min.   :    0.0   Min.   :   0.000  
##  1st Qu.:   9.00   1st Qu.:  0.520   1st Qu.:  135.0   1st Qu.:   0.000  
##  Median :  19.00   Median :  1.330   Median :  250.0   Median :   0.000  
##  Mean   :  73.53   Mean   :  2.828   Mean   :  301.4   Mean   :   9.436  
##  3rd Qu.:  56.00   3rd Qu.:  2.620   3rd Qu.:  348.0   3rd Qu.:   3.100  
##  Max.   :7364.00   Max.   :123.600   Max.   :16500.0   Max.   :2400.000  
##  NA's   :136       NA's   :123       NA's   :409       NA's   :332       
##     VitaminE          VitaminD       
##  Min.   :  0.000   Min.   :  0.0000  
##  1st Qu.:  0.120   1st Qu.:  0.0000  
##  Median :  0.270   Median :  0.0000  
##  Mean   :  1.488   Mean   :  0.5769  
##  3rd Qu.:  0.710   3rd Qu.:  0.1000  
##  Max.   :149.400   Max.   :250.0000  
##  NA's   :2720      NA's   :2834
# Video 3 - Basic Data Analysis

# Finding the index of the food with highest sodium levels
# The index is stored so the lookup below uses the computed row rather than a
# number copied by hand from the printed output.
  max_sodium_idx <- which.max(USDA$Sodium)
  max_sodium_idx
## [1] 265
# Get names of variables in the dataset
  names(USDA)
##  [1] "ID"           "Description"  "Calories"     "Protein"     
##  [5] "TotalFat"     "Carbohydrate" "Sodium"       "SaturatedFat"
##  [9] "Cholesterol"  "Sugar"        "Calcium"      "Iron"        
## [13] "Potassium"    "VitaminC"     "VitaminE"     "VitaminD"
# Get the name of the food with highest sodium levels
  USDA$Description[max_sodium_idx]
## [1] SALT,TABLE
## 7054 Levels: ABALONE,MIXED SPECIES,RAW ... ZWIEBACK
# Keep only the foods whose sodium content is above 10,000mg.
# Rows with a missing Sodium value are excluded explicitly.
  HighSodium <- USDA[!is.na(USDA$Sodium) & USDA$Sodium > 10000, , drop = FALSE]
# Number of rows (observations) in the subset
  nrow(HighSodium)
## [1] 10
# Descriptions of the foods in the high-sodium subset
  HighSodium[["Description"]]
##  [1] SALT,TABLE                                             
##  [2] SOUP,BF BROTH OR BOUILLON,PDR,DRY                      
##  [3] SOUP,BEEF BROTH,CUBED,DRY                              
##  [4] SOUP,CHICK BROTH OR BOUILLON,DRY                       
##  [5] SOUP,CHICK BROTH CUBES,DRY                             
##  [6] GRAVY,AU JUS,DRY                                       
##  [7] ADOBO FRESCO                                           
##  [8] LEAVENING AGENTS,BAKING PDR,DOUBLE-ACTING,NA AL SULFATE
##  [9] LEAVENING AGENTS,BAKING SODA                           
## [10] DESSERTS,RENNIN,TABLETS,UNSWTND                        
## 7054 Levels: ABALONE,MIXED SPECIES,RAW ... ZWIEBACK
# Finding the index of CAVIAR in the dataset
# match() gives the position of the first exact match (NA if there is none);
# the result is stored so the next step does not depend on a hand-copied number.
  caviar_idx <- match("CAVIAR", USDA$Description)
  caviar_idx
## [1] 4154
# Find amount of sodium in caviar
  USDA$Sodium[caviar_idx]
## [1] 1500
# Doing it in one command!
  USDA$Sodium[match("CAVIAR", USDA$Description)]
## [1] 1500
# Quartiles, mean and NA count for the Sodium column
  with(USDA, summary(Sodium))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0    37.0    79.0   322.1   386.0 38760.0      84
# Standard deviation of Sodium, leaving out missing values
  with(USDA, sd(Sodium, na.rm = TRUE))
## [1] 1045.417
# Video 4 - Plots

# Scatter plot of total fat against protein.
# No labels are supplied here, so the axis labels default to the argument
# expressions themselves.
  plot(USDA$Protein, USDA$TotalFat)

# Same scatter plot with axis labels, a title and red points
  plot(
    x = USDA$Protein, y = USDA$TotalFat,
    xlab = "Protein", ylab = "Fat",
    main = "Protein vs Fat", col = "red"
  )

# Histogram of vitamin C levels
  hist(
    x = USDA$VitaminC,
    xlab = "Vitamin C (mg)",
    main = "Histogram of Vitamin C"
  )

# Restrict the x-axis to 0-100 mg to zoom in on the bulk of the foods
  hist(
    x = USDA$VitaminC,
    xlab = "Vitamin C (mg)",
    main = "Histogram of Vitamin C",
    xlim = c(0, 100)
  )

# Ask for about 100 cells; these are spread over the full data range, so only
# a few of them land inside the 0-100 mg window
  hist(
    x = USDA$VitaminC,
    xlab = "Vitamin C (mg)",
    main = "Histogram of Vitamin C",
    xlim = c(0, 100),
    breaks = 100
  )

# Ask for about 2000 cells to get a much finer resolution inside the window
  hist(
    x = USDA$VitaminC,
    xlab = "Vitamin C (mg)",
    main = "Histogram of Vitamin C",
    xlim = c(0, 100),
    breaks = 2000
  )

# Boxplot of sugar levels
  boxplot(
    USDA$Sugar,
    ylab = "Sugar (g)",
    main = "Boxplot of Sugar"
  )

# Video 5 - Adding a variable

# Helper: flag each value of x that is above the mean of x.
# Returns a numeric vector the same length as x: 1 if above the mean, 0 if not,
# NA where the value itself is missing. The mean is computed ignoring NAs.
  above_mean <- function(x) {
    as.numeric(x > mean(x, na.rm = TRUE))
  }
# Creating a variable that takes value 1 if the food has higher sodium than average, 0 otherwise
# NOTE: this reuses the name HighSodium, replacing any object of that name
# created earlier in the session.
  HighSodium <- above_mean(USDA$Sodium)
  str(HighSodium)
##  num [1:7058] 1 1 0 1 1 1 1 1 1 1 ...
# Adding the variable to the dataset
  USDA$HighSodium <- HighSodium
# Similarly for HighCarbs, HighProtein, HighFat
  USDA$HighCarbs <- above_mean(USDA$Carbohydrate)
  USDA$HighProtein <- above_mean(USDA$Protein)
  USDA$HighFat <- above_mean(USDA$TotalFat)


# Video 6 - Summary Tables

# Count of foods at or below (0) and above (1) the average sodium level;
# foods with a missing indicator are not counted
  table(USDA[["HighSodium"]])
## 
##    0    1 
## 4884 2090
# Cross-tabulation of the sodium indicator (rows) against the fat indicator
# (columns); the bottom-right cell is high sodium and high fat
  table(USDA[["HighSodium"]], USDA[["HighFat"]])
##    
##        0    1
##   0 3529 1355
##   1 1378  712
# Average amount of iron, grouped by low (0) and high (1) protein
# na.rm = TRUE is forwarded by tapply() to mean() so missing iron values are skipped
  tapply(USDA$Iron, USDA$HighProtein, mean, na.rm = TRUE)
##        0        1 
## 2.558945 3.197294
# Maximum level of Vitamin C in foods with low (0) and high (1) carbs
  tapply(USDA$VitaminC, USDA$HighCarbs, max, na.rm = TRUE)
##      0      1 
## 1677.6 2400.0
# Using summary function with tapply
# summary() sets missing values aside itself and reports their count under
# "NA's"; it takes no na.rm argument, so none is passed here.
  tapply(USDA$VitaminC, USDA$HighCarbs, summary)
## $`0`
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##    0.000    0.000    0.000    6.364    2.800 1678.000      248 
## 
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.20   16.31    4.50 2400.00      83