We will use the USDA dataset from the National Nutrient Database for Standard Reference.
# Load the USDA nutrient table into a data frame.
# stringsAsFactors is set explicitly because its default changed to FALSE in
# R 4.0.0; the factor level counts and "Levels" listings shown in the output
# of this document rely on Description being read as a factor.
# NOTE(review): the absolute path is machine-specific; point it at your copy.
USDA <- read.csv(
  "C:\\Users\\aman96\\Desktop\\the analytics edge\\unit 1\\USDA.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
First six rows of the data
# Preview the first six rows (the default for head(), spelled out here).
head(USDA, n = 6L)
## ID Description Calories Protein TotalFat Carbohydrate
## 1 1001 BUTTER,WITH SALT 717 0.85 81.11 0.06
## 2 1002 BUTTER,WHIPPED,WITH SALT 717 0.85 81.11 0.06
## 3 1003 BUTTER OIL,ANHYDROUS 876 0.28 99.48 0.00
## 4 1004 CHEESE,BLUE 353 21.40 28.74 2.34
## 5 1005 CHEESE,BRICK 371 23.24 29.68 2.79
## 6 1006 CHEESE,BRIE 334 20.75 27.68 0.45
## Sodium SaturatedFat Cholesterol Sugar Calcium Iron Potassium VitaminC
## 1 714 51.368 215 0.06 24 0.02 24 0
## 2 827 50.489 219 0.06 24 0.16 26 0
## 3 2 61.924 256 0.00 4 0.00 5 0
## 4 1395 18.669 75 0.50 528 0.31 256 0
## 5 560 18.764 94 0.51 674 0.43 136 0
## 6 629 17.410 100 0.45 184 0.50 152 0
## VitaminE VitaminD
## 1 2.32 1.5
## 2 2.32 1.5
## 3 2.80 1.8
## 4 0.25 0.5
## 5 0.26 0.5
## 6 0.24 0.5
Exploring the USDA data
# Per-column overview: quartiles and mean for the numeric columns, level
# counts for the factor column, and the number of missing values (NA's).
summary(USDA)
## ID
## Min. : 1001
## 1st Qu.: 8387
## Median :13294
## Mean :14260
## 3rd Qu.:18337
## Max. :93600
##
## Description
## BEEF,CHUCK,UNDER BLADE CNTR STEAK,BNLESS,DENVER CUT,LN,0" FA: 2
## CAMPBELL,CAMPBELL'S SEL MICROWAVEABLE BOWLS,HEA : 2
## OIL,INDUSTRIAL,PALM KERNEL (HYDROGENATED),CONFECTION FAT : 2
## POPCORN,OIL-POPPED,LOFAT : 2
## ABALONE,MIXED SPECIES,RAW : 1
## ABALONE,MXD SP,CKD,FRIED : 1
## (Other) :7048
## Calories Protein TotalFat Carbohydrate
## Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 85.0 1st Qu.: 2.29 1st Qu.: 0.72 1st Qu.: 0.00
## Median :181.0 Median : 8.20 Median : 4.37 Median : 7.13
## Mean :219.7 Mean :11.71 Mean : 10.32 Mean : 20.70
## 3rd Qu.:331.0 3rd Qu.:20.43 3rd Qu.: 12.70 3rd Qu.: 28.17
## Max. :902.0 Max. :88.32 Max. :100.00 Max. :100.00
## NA's :1 NA's :1 NA's :1 NA's :1
## Sodium SaturatedFat Cholesterol Sugar
## Min. : 0.0 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 37.0 1st Qu.: 0.172 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 79.0 Median : 1.256 Median : 3.00 Median : 1.395
## Mean : 322.1 Mean : 3.452 Mean : 41.55 Mean : 8.257
## 3rd Qu.: 386.0 3rd Qu.: 4.028 3rd Qu.: 69.00 3rd Qu.: 7.875
## Max. :38758.0 Max. :95.600 Max. :3100.00 Max. :99.800
## NA's :84 NA's :301 NA's :288 NA's :1910
## Calcium Iron Potassium VitaminC
## Min. : 0.00 Min. : 0.000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 9.00 1st Qu.: 0.520 1st Qu.: 135.0 1st Qu.: 0.000
## Median : 19.00 Median : 1.330 Median : 250.0 Median : 0.000
## Mean : 73.53 Mean : 2.828 Mean : 301.4 Mean : 9.436
## 3rd Qu.: 56.00 3rd Qu.: 2.620 3rd Qu.: 348.0 3rd Qu.: 3.100
## Max. :7364.00 Max. :123.600 Max. :16500.0 Max. :2400.000
## NA's :136 NA's :123 NA's :409 NA's :332
## VitaminE VitaminD
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.120 1st Qu.: 0.0000
## Median : 0.270 Median : 0.0000
## Mean : 1.488 Mean : 0.5769
## 3rd Qu.: 0.710 3rd Qu.: 0.1000
## Max. :149.400 Max. :250.0000
## NA's :2720 NA's :2834
# Compact structure listing: number of observations and variables, then each
# column's type followed by its first few values.
str(USDA)
## 'data.frame': 7058 obs. of 16 variables:
## $ ID : int 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
## $ Description : Factor w/ 7054 levels "ABALONE,MIXED SPECIES,RAW",..: 1303 1302 1298 2303 2304 2305 2306 2307 2308 2309 ...
## $ Calories : int 717 717 876 353 371 334 300 376 403 387 ...
## $ Protein : num 0.85 0.85 0.28 21.4 23.24 ...
## $ TotalFat : num 81.1 81.1 99.5 28.7 29.7 ...
## $ Carbohydrate: num 0.06 0.06 0 2.34 2.79 0.45 0.46 3.06 1.28 4.78 ...
## $ Sodium : int 714 827 2 1395 560 629 842 690 621 700 ...
## $ SaturatedFat: num 51.4 50.5 61.9 18.7 18.8 ...
## $ Cholesterol : int 215 219 256 75 94 100 72 93 105 103 ...
## $ Sugar : num 0.06 0.06 0 0.5 0.51 0.45 0.46 NA 0.52 NA ...
## $ Calcium : int 24 24 4 528 674 184 388 673 721 643 ...
## $ Iron : num 0.02 0.16 0 0.31 0.43 0.5 0.33 0.64 0.68 0.21 ...
## $ Potassium : int 24 26 5 256 136 152 187 93 98 95 ...
## $ VitaminC : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VitaminE : num 2.32 2.32 2.8 0.25 0.26 0.24 0.21 NA 0.29 NA ...
## $ VitaminD : num 1.5 1.5 1.8 0.5 0.5 0.5 0.4 NA 0.6 NA ...
High-sodium data analysis
# Keep only the foods whose Sodium value exceeds 10000. which() discards rows
# where Sodium is missing, just as subset() treats a missing condition as
# FALSE.
Highsodium <- USDA[which(USDA[["Sodium"]] > 10000), ]
# List the descriptions of the foods that passed the filter.
Highsodium[["Description"]]
## [1] SALT,TABLE
## [2] SOUP,BF BROTH OR BOUILLON,PDR,DRY
## [3] SOUP,BEEF BROTH,CUBED,DRY
## [4] SOUP,CHICK BROTH OR BOUILLON,DRY
## [5] SOUP,CHICK BROTH CUBES,DRY
## [6] GRAVY,AU JUS,DRY
## [7] ADOBO FRESCO
## [8] LEAVENING AGENTS,BAKING PDR,DOUBLE-ACTING,NA AL SULFATE
## [9] LEAVENING AGENTS,BAKING SODA
## [10] DESSERTS,RENNIN,TABLETS,UNSWTND
## 7054 Levels: ABALONE,MIXED SPECIES,RAW ... ZWIEBACK
# Distribution of the Sodium column on its own, including its NA count.
summary(USDA[["Sodium"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 37.0 79.0 322.1 386.0 38760.0 84
# Standard deviation of Sodium; missing values are removed first, otherwise
# the result would be NA.
sd(x = USDA[["Sodium"]], na.rm = TRUE)
## [1] 1045.417
Now we visualize the data using the plot function
# Scatter plot of total fat against protein, one red point per food.
plot(
  x = USDA[["Protein"]],
  y = USDA[["TotalFat"]],
  main = "Protein Vs Fat",
  xlab = "Protein",
  ylab = "Fat",
  col = "red"
)
Making histograms
# Histogram of Vitamin C. The x axis is limited to 0-100 and a large number
# of breaks is requested so the bars inside that window stay narrow.
hist(
  x = USDA[["VitaminC"]],
  breaks = 2000,
  xlim = c(0, 100),
  main = "Histogram of Vitamin C levels",
  xlab = "Vitamin C (mg)"
)
Adding additional variables to the dataset
# Flag each food as 1 when its Sodium is above the mean Sodium (computed
# without missing values) and 0 otherwise; a missing Sodium stays NA.
Highsodium <- as.numeric(
  USDA[["Sodium"]] > mean(USDA[["Sodium"]], na.rm = TRUE)
)
# Store the flag as a new column and confirm it appears in the structure.
USDA[["Highsodium"]] <- Highsodium
str(USDA)
## 'data.frame': 7058 obs. of 17 variables:
## $ ID : int 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
## $ Description : Factor w/ 7054 levels "ABALONE,MIXED SPECIES,RAW",..: 1303 1302 1298 2303 2304 2305 2306 2307 2308 2309 ...
## $ Calories : int 717 717 876 353 371 334 300 376 403 387 ...
## $ Protein : num 0.85 0.85 0.28 21.4 23.24 ...
## $ TotalFat : num 81.1 81.1 99.5 28.7 29.7 ...
## $ Carbohydrate: num 0.06 0.06 0 2.34 2.79 0.45 0.46 3.06 1.28 4.78 ...
## $ Sodium : int 714 827 2 1395 560 629 842 690 621 700 ...
## $ SaturatedFat: num 51.4 50.5 61.9 18.7 18.8 ...
## $ Cholesterol : int 215 219 256 75 94 100 72 93 105 103 ...
## $ Sugar : num 0.06 0.06 0 0.5 0.51 0.45 0.46 NA 0.52 NA ...
## $ Calcium : int 24 24 4 528 674 184 388 673 721 643 ...
## $ Iron : num 0.02 0.16 0 0.31 0.43 0.5 0.33 0.64 0.68 0.21 ...
## $ Potassium : int 24 26 5 256 136 152 187 93 98 95 ...
## $ VitaminC : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VitaminE : num 2.32 2.32 2.8 0.25 0.26 0.24 0.21 NA 0.29 NA ...
## $ VitaminD : num 1.5 1.5 1.8 0.5 0.5 0.5 0.4 NA 0.6 NA ...
## $ Highsodium : num 1 1 0 1 1 1 1 1 1 1 ...
Using the tapply function
# Mean Iron within each Highsodium group (0 / 1), ignoring missing Iron
# values.
with(USDA, tapply(Iron, Highsodium, mean, na.rm = TRUE))
## 0 1
## 2.332241 3.914451
# Largest Iron value within each Highsodium group (0 / 1), ignoring missing
# Iron values.
with(USDA, tapply(Iron, Highsodium, max, na.rm = TRUE))
## 0 1
## 123.60 87.47
# Quartiles, mean and extremes of Iron within each Highsodium group (0 / 1).
# No na.rm argument is passed: summary() sets missing values aside by itself
# and reports their count in the NA's column, so na.rm = TRUE had no effect
# on this call.
tapply(USDA$Iron, USDA$Highsodium, summary)
## $`0`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.460 1.290 2.332 2.448 123.600 30
##
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.6375 1.4000 3.9140 3.2000 87.4700 50