###
# carbon-footprint-exploration.R
###

setwd("~/Documents/kaggle/food/")

df = read.csv("./data/sample10K_blankNAs.csv", header = TRUE, stringsAsFactors = FALSE)
head(df)
##           code                                       product_name
## 1 3.421557e+12            Terres et Céréales bio Couscous Complet
## 2 8.480018e+12 Tableta de chocolate negro con avellanas 55% cacao
## 3 5.019124e+12                      Butter ghee (beurre clarifié)
## 4 3.555081e+12                   nonettes aux miel et aux amandes
## 5 3.564700e+12             P'tit-beurre tablette chocolat au lait
## 6 3.250392e+12                                 Caviar d'aubergine
##   nutrition_grade_fr countries_en proteins_100g carbohydrates_100g
## 1                          France            NA                 NA
## 2                  e        Spain           7.5                 40
## 3                        Portugal            NA                 NA
## 4                  d       France           3.8                 73
## 5                  e       France           6.5                 65
## 6                          France            NA                 NA
##   sugars_100g trans_fat_100g carbon_footprint_100g
## 1          NA             NA                    NA
## 2          37             NA                    NA
## 3          NA             NA                    NA
## 4          45             NA                    NA
## 5          39             NA                    NA
## 6          NA             NA                    NA

Sine many of the fields are missing and not relevant for the classification of food grade, we will only select the relevant columns.

df.grades = df[,c("nutrition_grade_fr", "proteins_100g", "carbohydrates_100g", "sugars_100g", "trans_fat_100g")]

Lets now see code the target variable (nutrition_grade_fr) as a Factor variable and compute the summary statistics:

df.grades$nutrition_grade_fr = as.factor(df.grades$nutrition_grade_fr)
summary(df.grades)
##  nutrition_grade_fr proteins_100g    carbohydrates_100g  sugars_100g    
##   :5216             Min.   : 0.000   Min.   :  0.00     Min.   :  0.00  
##  a: 892             1st Qu.: 1.600   1st Qu.:  4.20     1st Qu.:  1.10  
##  b: 792             Median : 5.800   Median : 13.80     Median :  4.10  
##  c:1063             Mean   : 7.436   Mean   : 27.58     Mean   : 12.77  
##  d:1242             3rd Qu.:10.100   3rd Qu.: 53.50     3rd Qu.: 15.40  
##  e: 795             Max.   :86.000   Max.   :100.80     Max.   :100.80  
##                     NA's   :4483     NA's   :4480       NA's   :5027    
##  trans_fat_100g 
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.113  
##  3rd Qu.:0.000  
##  Max.   :7.140  
##  NA's   :9803

It appears that many of the target variables are blank (missing). Let’s remove these

df.grades = subset(df.grades, nutrition_grade_fr != "")
df.grades = droplevels(df.grades)
summary(df.grades)
##  nutrition_grade_fr proteins_100g    carbohydrates_100g  sugars_100g    
##  a: 892             Min.   : 0.000   Min.   :  0.00     Min.   :  0.00  
##  b: 792             1st Qu.: 1.700   1st Qu.:  4.50     1st Qu.:  1.10  
##  c:1063             Median : 5.800   Median : 14.10     Median :  4.00  
##  d:1242             Mean   : 7.432   Mean   : 27.57     Mean   : 12.62  
##  e: 795             3rd Qu.:10.000   3rd Qu.: 53.00     3rd Qu.: 15.50  
##                     Max.   :86.000   Max.   :100.80     Max.   :100.80  
##                     NA's   :16       NA's   :20         NA's   :16      
##  trans_fat_100g 
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.128  
##  3rd Qu.:0.000  
##  Max.   :7.140  
##  NA's   :4612
library(ggplot2)
df.binary = droplevels(subset(df.grades, nutrition_grade_fr == "a" | nutrition_grade_fr == "e"))
ggplot(df.binary, aes(x = log10(1+proteins_100g), y = log10(1+sugars_100g), colour = nutrition_grade_fr)) + geom_point(size = .5)

library(rpart)
fit <- rpart(nutrition_grade_fr ~ ., method="class", data=droplevels(df.binary), maxdepth= 2)
library(rpart.plot)
rpart.plot(fit, type = 4, extra = 2)

table(df.binary$nutrition_grade_fr)
## 
##   a   e 
## 892 795
first.split.right = df.binary[df.binary$sugars_100g >= 20.65,]
table(first.split.right$nutrition_grade_fr)
## 
##   a   e 
##  14 464
first.split.left = df.binary[df.binary$sugars_100g < 20.65,]
table(first.split.left$nutrition_grade_fr)
## 
##   a   e 
## 878 331
fit <- rpart(nutrition_grade_fr ~ ., method="class", data=droplevels(df.binary))
rpart.plot(fit, type = 4, extra = 2)

ggplot(df.grades, aes(x = log10(1+proteins_100g), y = log10(1+sugars_100g), colour = nutrition_grade_fr)) + geom_point(size = .5)
## Warning: Removed 16 rows containing missing values (geom_point).

Now let’s split the data into a random (75%) training and (25%) test set.

train_ind = sample(nrow(df.grades), size = .75*nrow(df.grades))
df.grades.train = df.grades[train_ind, ] 
df.grades.test = df.grades[-train_ind,]

Now lets build the decision tree!

df.grades$nutrition_grade_fr = as.factor(df.grades$nutrition_grade_fr)
fit <- rpart(nutrition_grade_fr ~ ., method="class", data=df.grades)

printcp(fit) # display the results
## 
## Classification tree:
## rpart(formula = nutrition_grade_fr ~ ., data = df.grades, method = "class")
## 
## Variables actually used in tree construction:
## [1] carbohydrates_100g proteins_100g      sugars_100g       
## 
## Root node error: 3526/4768 = 0.73951
## 
## n=4768 (16 observations deleted due to missingness)
## 
##         CP nsplit rel error  xerror      xstd
## 1 0.043392      0   1.00000 1.00000 0.0085951
## 2 0.041690      1   0.95661 0.96086 0.0088810
## 3 0.030062      3   0.87323 0.88060 0.0093331
## 4 0.013046      4   0.84317 0.84628 0.0094765
## 5 0.012479      5   0.83012 0.83607 0.0095137
## 6 0.010777      6   0.81764 0.82728 0.0095438
## 7 0.010493      8   0.79609 0.82076 0.0095650
## 8 0.010000      9   0.78559 0.81452 0.0095843
rpart.plot(fit, type = 4, extra = 2) # visualize cross-validation results 

# plot tree 
rpart.plot(fit, type = 4, extra = 2)

require(class)
## Loading required package: class
df.grades.test.simple = na.omit(df.grades.test[,c("nutrition_grade_fr", "proteins_100g", "sugars_100g")])
df.grades.train.simple = na.omit(df.grades.train[,c("nutrition_grade_fr", "proteins_100g", "sugars_100g")])

df.grades.test.simple = droplevels(subset(df.grades.test.simple, nutrition_grade_fr %in% c("a", "b")))
df.grades.train.simple = droplevels(subset(df.grades.train.simple, nutrition_grade_fr %in% c("a", "b")))
classif <- knn(df.grades.train.simple[,-1], df.grades.test.simple[,-1], cl = df.grades.train.simple$nutrition_grade_fr, k = 3, prob=TRUE)
prob <- attr(classif, "prob")


 require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
 dataf <- bind_rows(mutate(df.grades.test.simple,
                           prob=prob,
                           cls="a",
                           prob_cls=ifelse(classif==cls,
                                           1, 0)),
                    mutate(df.grades.test.simple,
                           prob=prob,
                           cls="e",
                           prob_cls=ifelse(classif==cls,
                                           1, 0)))

 ggplot(dataf) +
    geom_point(aes(x=proteins_100g, y=sugars_100g, col=cls),
               data = mutate(df.grades.test.simple, cls=classif),
               size=1.2) + 
    geom_contour(aes(x=proteins_100g, y=sugars_100g, z=prob_cls, group=cls, color=cls),
                 bins=2,
                 data=dataf) 
## Warning: Computation failed in `stat_contour()`:
## (list) object cannot be coerced to type 'double'

library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.4
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
#scatterplot of x and y variables
scatter <- ggplot(df.binary,aes(log10(1+proteins_100g), log10(1+sugars_100g))) + 
  geom_point(aes(color=nutrition_grade_fr))  + 
  theme(legend.position = "none") + geom_rug(aes(col=nutrition_grade_fr,alpha=.1))

#marginal density of x - plot on top
plot_top <- ggplot(df.binary, aes(log10(1+proteins_100g), fill=nutrition_grade_fr)) + 
  geom_density(alpha=.5) + 
  theme(legend.position = "none")

#marginal density of y - plot on the right
plot_right <- ggplot(df.binary, aes(log10(1+sugars_100g), fill=nutrition_grade_fr)) + 
  geom_density(alpha=.5) + 
  coord_flip() + 
  theme(legend.position = "none")

#placeholder plot - prints nothing at all
empty <- ggplot()+geom_point(aes(1,1), colour="white") +
     theme(                              
       plot.background = element_blank(), 
       panel.grid.major = element_blank(), 
       panel.grid.minor = element_blank(), 
       panel.border = element_blank(), 
       panel.background = element_blank(),
       axis.title.x = element_blank(),
       axis.title.y = element_blank(),
       axis.text.x = element_blank(),
       axis.text.y = element_blank(),
       axis.ticks = element_blank()
     )
#arrange the plots together, with appropriate height and width for each row and column
grid.arrange(plot_top, empty, scatter, plot_right, ncol=2, nrow=2, widths=c(4, 1), heights=c(1, 4))