###
# carbon-footprint-exploration.R
###
setwd("~/Documents/kaggle/food/")
df = read.csv("./data/sample10K_blankNAs.csv", header = TRUE, stringsAsFactors = FALSE)
head(df)
## code product_name
## 1 3.421557e+12 Terres et Céréales bio Couscous Complet
## 2 8.480018e+12 Tableta de chocolate negro con avellanas 55% cacao
## 3 5.019124e+12 Butter ghee (beurre clarifié)
## 4 3.555081e+12 nonettes aux miel et aux amandes
## 5 3.564700e+12 P'tit-beurre tablette chocolat au lait
## 6 3.250392e+12 Caviar d'aubergine
## nutrition_grade_fr countries_en proteins_100g carbohydrates_100g
## 1 France NA NA
## 2 e Spain 7.5 40
## 3 Portugal NA NA
## 4 d France 3.8 73
## 5 e France 6.5 65
## 6 France NA NA
## sugars_100g trans_fat_100g carbon_footprint_100g
## 1 NA NA NA
## 2 37 NA NA
## 3 NA NA NA
## 4 45 NA NA
## 5 39 NA NA
## 6 NA NA NA
Sine many of the fields are missing and not relevant for the classification of food grade, we will only select the relevant columns.
df.grades = df[,c("nutrition_grade_fr", "proteins_100g", "carbohydrates_100g", "sugars_100g", "trans_fat_100g")]
Lets now see code the target variable (nutrition_grade_fr) as a Factor variable and compute the summary statistics:
df.grades$nutrition_grade_fr = as.factor(df.grades$nutrition_grade_fr)
summary(df.grades)
## nutrition_grade_fr proteins_100g carbohydrates_100g sugars_100g
## :5216 Min. : 0.000 Min. : 0.00 Min. : 0.00
## a: 892 1st Qu.: 1.600 1st Qu.: 4.20 1st Qu.: 1.10
## b: 792 Median : 5.800 Median : 13.80 Median : 4.10
## c:1063 Mean : 7.436 Mean : 27.58 Mean : 12.77
## d:1242 3rd Qu.:10.100 3rd Qu.: 53.50 3rd Qu.: 15.40
## e: 795 Max. :86.000 Max. :100.80 Max. :100.80
## NA's :4483 NA's :4480 NA's :5027
## trans_fat_100g
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.113
## 3rd Qu.:0.000
## Max. :7.140
## NA's :9803
It appears that many of the target variables are blank (missing). Let’s remove these
df.grades = subset(df.grades, nutrition_grade_fr != "")
df.grades = droplevels(df.grades)
summary(df.grades)
## nutrition_grade_fr proteins_100g carbohydrates_100g sugars_100g
## a: 892 Min. : 0.000 Min. : 0.00 Min. : 0.00
## b: 792 1st Qu.: 1.700 1st Qu.: 4.50 1st Qu.: 1.10
## c:1063 Median : 5.800 Median : 14.10 Median : 4.00
## d:1242 Mean : 7.432 Mean : 27.57 Mean : 12.62
## e: 795 3rd Qu.:10.000 3rd Qu.: 53.00 3rd Qu.: 15.50
## Max. :86.000 Max. :100.80 Max. :100.80
## NA's :16 NA's :20 NA's :16
## trans_fat_100g
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.128
## 3rd Qu.:0.000
## Max. :7.140
## NA's :4612
library(ggplot2)
df.binary = droplevels(subset(df.grades, nutrition_grade_fr == "a" | nutrition_grade_fr == "e"))
ggplot(df.binary, aes(x = log10(1+proteins_100g), y = log10(1+sugars_100g), colour = nutrition_grade_fr)) + geom_point(size = .5)
library(rpart)
fit <- rpart(nutrition_grade_fr ~ ., method="class", data=droplevels(df.binary), maxdepth= 2)
library(rpart.plot)
rpart.plot(fit, type = 4, extra = 2)
table(df.binary$nutrition_grade_fr)
##
## a e
## 892 795
first.split.right = df.binary[df.binary$sugars_100g >= 20.65,]
table(first.split.right$nutrition_grade_fr)
##
## a e
## 14 464
first.split.left = df.binary[df.binary$sugars_100g < 20.65,]
table(first.split.left$nutrition_grade_fr)
##
## a e
## 878 331
fit <- rpart(nutrition_grade_fr ~ ., method="class", data=droplevels(df.binary))
rpart.plot(fit, type = 4, extra = 2)
ggplot(df.grades, aes(x = log10(1+proteins_100g), y = log10(1+sugars_100g), colour = nutrition_grade_fr)) + geom_point(size = .5)
## Warning: Removed 16 rows containing missing values (geom_point).
Now let’s split the data into a random (75%) training and (25%) test set.
train_ind = sample(nrow(df.grades), size = .75*nrow(df.grades))
df.grades.train = df.grades[train_ind, ]
df.grades.test = df.grades[-train_ind,]
Now lets build the decision tree!
df.grades$nutrition_grade_fr = as.factor(df.grades$nutrition_grade_fr)
fit <- rpart(nutrition_grade_fr ~ ., method="class", data=df.grades)
printcp(fit) # display the results
##
## Classification tree:
## rpart(formula = nutrition_grade_fr ~ ., data = df.grades, method = "class")
##
## Variables actually used in tree construction:
## [1] carbohydrates_100g proteins_100g sugars_100g
##
## Root node error: 3526/4768 = 0.73951
##
## n=4768 (16 observations deleted due to missingness)
##
## CP nsplit rel error xerror xstd
## 1 0.043392 0 1.00000 1.00000 0.0085951
## 2 0.041690 1 0.95661 0.96086 0.0088810
## 3 0.030062 3 0.87323 0.88060 0.0093331
## 4 0.013046 4 0.84317 0.84628 0.0094765
## 5 0.012479 5 0.83012 0.83607 0.0095137
## 6 0.010777 6 0.81764 0.82728 0.0095438
## 7 0.010493 8 0.79609 0.82076 0.0095650
## 8 0.010000 9 0.78559 0.81452 0.0095843
rpart.plot(fit, type = 4, extra = 2) # visualize cross-validation results
# plot tree
rpart.plot(fit, type = 4, extra = 2)
require(class)
## Loading required package: class
df.grades.test.simple = na.omit(df.grades.test[,c("nutrition_grade_fr", "proteins_100g", "sugars_100g")])
df.grades.train.simple = na.omit(df.grades.train[,c("nutrition_grade_fr", "proteins_100g", "sugars_100g")])
df.grades.test.simple = droplevels(subset(df.grades.test.simple, nutrition_grade_fr %in% c("a", "b")))
df.grades.train.simple = droplevels(subset(df.grades.train.simple, nutrition_grade_fr %in% c("a", "b")))
classif <- knn(df.grades.train.simple[,-1], df.grades.test.simple[,-1], cl = df.grades.train.simple$nutrition_grade_fr, k = 3, prob=TRUE)
prob <- attr(classif, "prob")
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dataf <- bind_rows(mutate(df.grades.test.simple,
prob=prob,
cls="a",
prob_cls=ifelse(classif==cls,
1, 0)),
mutate(df.grades.test.simple,
prob=prob,
cls="e",
prob_cls=ifelse(classif==cls,
1, 0)))
ggplot(dataf) +
geom_point(aes(x=proteins_100g, y=sugars_100g, col=cls),
data = mutate(df.grades.test.simple, cls=classif),
size=1.2) +
geom_contour(aes(x=proteins_100g, y=sugars_100g, z=prob_cls, group=cls, color=cls),
bins=2,
data=dataf)
## Warning: Computation failed in `stat_contour()`:
## (list) object cannot be coerced to type 'double'
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.4
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
#scatterplot of x and y variables
scatter <- ggplot(df.binary,aes(log10(1+proteins_100g), log10(1+sugars_100g))) +
geom_point(aes(color=nutrition_grade_fr)) +
theme(legend.position = "none") + geom_rug(aes(col=nutrition_grade_fr,alpha=.1))
#marginal density of x - plot on top
plot_top <- ggplot(df.binary, aes(log10(1+proteins_100g), fill=nutrition_grade_fr)) +
geom_density(alpha=.5) +
theme(legend.position = "none")
#marginal density of y - plot on the right
plot_right <- ggplot(df.binary, aes(log10(1+sugars_100g), fill=nutrition_grade_fr)) +
geom_density(alpha=.5) +
coord_flip() +
theme(legend.position = "none")
#placeholder plot - prints nothing at all
empty <- ggplot()+geom_point(aes(1,1), colour="white") +
theme(
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank()
)
#arrange the plots together, with appropriate height and width for each row and column
grid.arrange(plot_top, empty, scatter, plot_right, ncol=2, nrow=2, widths=c(4, 1), heights=c(1, 4))