Data downloaded from Kaggle.
https://www.kaggle.com/datasets/utsavdey1410/food-nutrition-dataset
Clear the workspace and open the read.csv help page.
remove(list = ls())  # delete every object that ls() reports in the workspace
help("read.csv")     # open the help page for read.csv
My original import commands - you have to change the file path to your working directory
# Positional call: work out which argument of read.csv the path is matched to.
FOOD.DATA.GROUP1 <- read.csv(
  "~/Dropbox/WCAS/Summer/Data Analysis/Summer 2024/Day 2/FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv"
)

# Same import with the key argument (`file`) named explicitly, which is the
# better coding practice.
FOOD.DATA.GROUP1 <- read.csv(
  file = "~/Dropbox/WCAS/Summer/Data Analysis/Summer 2024/Day 2/FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv"
)

# Spelling out the default header = TRUE does not change the result, but it can
# be a good habit while you are new to R.
# Note: FOOD.DATA.GROUP2 is read from the same GROUP1 file as above.
FOOD.DATA.GROUP2 <- read.csv(
  file = "~/Dropbox/WCAS/Summer/Data Analysis/Summer 2024/Day 2/FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv",
  header = TRUE
)

# Re-import with header = FALSE to compare against the header = TRUE result;
# this overwrites the FOOD.DATA.GROUP2 created just above.
FOOD.DATA.GROUP2 <- read.csv(
  file = "~/Dropbox/WCAS/Summer/Data Analysis/Summer 2024/Day 2/FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv",
  header = FALSE
)
This piece of code is better because you do not have to change the file
path: you can simply run the code without making any changes, as long as
you maintain the original folder structure (do not delete the subfolder
FINAL FOOD DATASET) and your working directory is the folder that contains it.
# Relative path, positional call: identify the key argument of the command.
FOOD.DATA.GROUP1 <- read.csv(
  "FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv"
)

# Same relative path with the argument named and header = FALSE.
# Note: FOOD.DATA.GROUP2 is read from the GROUP1 file here as well.
FOOD.DATA.GROUP2 <- read.csv(
  file = "FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv",
  header = FALSE
)
We will create a healthy and an unhealthy panel of foods, based on
Nutrition.Density values (a food is flagged healthy when Nutrition.Density > 100).
?remove
# Drop the FOOD.DATA.GROUP2 object from the workspace.
remove(FOOD.DATA.GROUP2)

# Delete the column named X (presumably a row-index column from the CSV;
# TODO confirm against the raw file).
FOOD.DATA.GROUP1$X <- NULL

# Logical flag: TRUE when Nutrition.Density exceeds 100, FALSE otherwise,
# and NA when Nutrition.Density is missing.
FOOD.DATA.GROUP1$healthy <- FOOD.DATA.GROUP1$Nutrition.Density > 100

# Index with which() rather than the raw logical vector: a logical NA used as a
# row index yields an all-NA row, so foods with a missing Nutrition.Density
# would otherwise appear as junk rows in BOTH panels. which() keeps only the
# positions that are TRUE; with no missing values the result is unchanged.
df_healthy_food <- FOOD.DATA.GROUP1[which(FOOD.DATA.GROUP1$healthy), ]
df_unhealthy_food <- FOOD.DATA.GROUP1[which(!FOOD.DATA.GROUP1$healthy), ]
You can play with the nrows argument to import only a
small subset of the original data.
# Import only the first 10 data rows of the file via the nrows argument.
# header is written as TRUE rather than T: T is an ordinary variable that can
# be reassigned, whereas TRUE is a reserved word.
FOOD.DATA.GROUP2 <- read.csv(
  file = "~/Dropbox/WCAS/Summer/Data Analysis/Summer 2024/Day 2/FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv",
  header = TRUE,
  nrows = 10
)
We will use the psych package. Use the
stargazer package if you wish instead.
# install.packages("psych")  # one-time installation; uncomment if needed
library(psych)

# Descriptive statistics for the healthy panel.
# NOTE(review): the min/max warnings in the output appear to come from the
# logical `healthy` column, whose row shows NaN / Inf / -Inf.
describe(x = df_healthy_food)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## Unnamed..0 1 246 248.69 160.28 246.00 244.26 199.41 1.00 550.0
## food* 2 246 123.50 71.16 123.50 123.50 91.18 1.00 246.0
## Caloric.Value 3 246 373.58 211.94 347.50 348.85 172.72 71.00 1578.0
## Fat 4 246 18.22 15.04 14.60 16.04 12.45 0.00 87.5
## Saturated.Fats 5 246 6.36 6.46 4.75 5.30 4.67 0.00 43.5
## Monounsaturated.Fats 6 246 6.20 5.99 4.55 5.29 4.45 0.00 34.2
## Polyunsaturated.Fats 7 246 3.51 4.64 1.95 2.68 2.00 0.00 40.1
## Carbohydrates 8 246 24.36 24.56 22.70 21.47 32.02 0.00 128.3
## Sugars 9 246 4.41 10.29 0.02 2.03 0.03 0.00 70.8
## Protein 10 246 27.56 21.77 20.65 24.18 16.01 2.00 86.9
## Dietary.Fiber 11 246 1.56 2.67 0.00 0.97 0.00 0.00 17.5
## Cholesterol 12 246 92.86 81.35 70.40 83.32 71.68 0.00 352.5
## Sodium 13 246 0.78 0.63 0.70 0.72 0.59 0.00 6.1
## Water 14 246 126.26 83.34 108.60 119.40 82.43 1.10 489.3
## Vitamin.A 15 246 0.10 0.22 0.05 0.06 0.07 0.00 2.1
## Vitamin.B1 16 246 0.25 0.25 0.20 0.21 0.18 0.00 1.9
## Vitamin.B11 17 246 0.07 0.11 0.06 0.06 0.05 0.00 1.3
## Vitamin.B12 18 246 0.04 0.04 0.04 0.04 0.04 0.00 0.4
## Vitamin.B2 19 246 0.32 0.35 0.30 0.28 0.15 0.00 3.8
## Vitamin.B3 20 246 5.85 7.83 3.30 4.12 3.71 0.00 57.8
## Vitamin.B5 21 246 1.32 2.56 0.80 0.92 0.74 0.00 31.4
## Vitamin.B6 22 246 0.40 0.57 0.20 0.28 0.22 0.00 4.3
## Vitamin.C 23 246 3.37 6.62 0.40 1.77 0.59 0.00 42.9
## Vitamin.D 24 246 0.18 1.93 0.00 0.01 0.00 0.00 29.3
## Vitamin.E 25 246 0.90 1.67 0.20 0.53 0.30 0.00 14.1
## Vitamin.K 26 246 0.06 0.28 0.01 0.02 0.01 0.00 3.1
## Calcium 27 246 189.19 220.75 115.20 136.22 87.10 0.00 1283.5
## Copper 28 246 5.36 48.74 0.20 0.19 0.15 0.00 668.6
## Iron 29 246 2.41 2.20 2.10 2.14 1.48 0.01 21.1
## Magnesium 30 246 57.45 56.44 42.15 48.15 31.21 0.00 376.2
## Manganese 31 246 1.82 10.66 0.20 0.27 0.26 0.00 111.6
## Phosphorus 32 246 363.23 274.03 297.40 325.62 212.68 0.00 1385.1
## Potassium 33 246 516.38 454.11 373.70 449.24 310.83 0.00 3198.4
## Selenium 34 246 16.88 127.12 0.05 0.05 0.05 0.00 1351.6
## Zinc 35 246 2.76 9.47 1.60 1.84 1.04 0.00 147.3
## Nutrition.Density 36 246 266.76 224.81 201.44 212.55 105.03 100.10 1337.0
## healthy 37 246 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## Unnamed..0 549.00 0.18 -1.22 10.22
## food* 245.00 0.00 -1.21 4.54
## Caloric.Value 1507.00 1.83 6.25 13.51
## Fat 87.50 1.57 3.08 0.96
## Saturated.Fats 43.50 2.40 8.63 0.41
## Monounsaturated.Fats 34.20 1.93 5.13 0.38
## Polyunsaturated.Fats 40.10 4.06 24.95 0.30
## Carbohydrates 128.30 1.05 1.55 1.57
## Sugars 70.80 4.03 17.86 0.66
## Protein 84.90 1.21 0.54 1.39
## Dietary.Fiber 17.50 2.58 8.66 0.17
## Cholesterol 352.50 0.93 -0.04 5.19
## Sodium 6.10 2.78 18.96 0.04
## Water 488.20 0.96 1.33 5.31
## Vitamin.A 2.10 6.07 44.77 0.01
## Vitamin.B1 1.90 2.78 12.83 0.02
## Vitamin.B11 1.30 7.12 66.91 0.01
## Vitamin.B12 0.40 3.05 23.41 0.00
## Vitamin.B2 3.80 5.98 50.89 0.02
## Vitamin.B3 57.80 2.75 9.74 0.50
## Vitamin.B5 31.40 8.04 82.46 0.16
## Vitamin.B6 4.30 2.99 11.99 0.04
## Vitamin.C 42.90 3.28 12.66 0.42
## Vitamin.D 29.30 14.18 209.05 0.12
## Vitamin.E 14.10 3.78 19.65 0.11
## Vitamin.K 3.10 9.00 85.51 0.02
## Calcium 1283.50 2.59 6.51 14.07
## Copper 668.60 11.67 145.88 3.11
## Iron 21.09 3.68 23.60 0.14
## Magnesium 376.20 2.86 10.92 3.60
## Manganese 111.60 7.71 63.19 0.68
## Phosphorus 1385.10 1.30 1.55 17.47
## Potassium 3198.40 1.71 4.54 28.95
## Selenium 1351.60 8.33 73.33 8.10
## Zinc 147.30 14.47 217.48 0.60
## Nutrition.Density 1236.90 2.49 5.92 14.33
## healthy -Inf NA NA NA
COMMENT ON THE RESULTS….
The read.csv help file gives us some examples of how to
use the command.
## count.fields() finds the widest row when the maximum number of fields
## is not known in advance and fill = TRUE is used
test1 <- c(1:5, "6,7", "8,9,10")
tf <- tempfile()
writeLines(text = test1, con = tf)

read.csv(file = tf, fill = TRUE) # 1 column

## Largest field count over all lines of the temporary file
ncol <- max(count.fields(file = tf, sep = ","))
read.csv(
  file = tf,
  header = FALSE,
  fill = TRUE,
  col.names = paste0("V", seq_len(ncol))
)

## Delete the temporary file
unlink(tf)

## "Inline" data set supplied through the text= argument
## Leading and trailing empty lines are trimmed automatically
read.table(text = "
a b
1 2
3 4
", header = TRUE)