library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(hexbin)
library(RColorBrewer)
data <- read.csv("C:/Users/londeree.4/Desktop/Taste-HealthPilot.csv")
summary(data)
## Qnum Image H_AML H_ST
## 1_Q6 : 1 Min. : 1.0 Min. :1.000 Min. :1.000
## 10_Q6 : 1 1st Qu.:100.5 1st Qu.:2.000 1st Qu.:3.000
## 100_Q6 : 1 Median :190.0 Median :4.000 Median :4.000
## 101_Q6 : 1 Mean :191.5 Mean :4.057 Mean :4.131
## 102_Q6 : 1 3rd Qu.:281.5 3rd Qu.:6.000 3rd Qu.:6.000
## 103_Q6 : 1 Max. :377.0 Max. :7.000 Max. :7.000
## (Other):329
## H_Diff T_AML T_ST T_Diff
## Min. :0.000 Min. :1.000 Min. :1.00 Min. :0.000
## 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:0.000
## Median :1.000 Median :2.000 Median :3.00 Median :1.000
## Mean :1.227 Mean :2.296 Mean :2.97 Mean :1.433
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:4.00 3rd Qu.:2.000
## Max. :6.000 Max. :7.000 Max. :7.00 Max. :6.000
##
In the raw data, a low health score actually indicates a healthier image, and the same is true of taste. Let's reverse-code these variables to make the graphs easier to interpret.
data$RH_AML <- abs(data$H_AML-8)
data$RH_ST <- abs(data$H_ST-8)
data$RT_AML <- abs(data$T_AML-8)
data$RT_ST <- abs(data$T_ST-8)
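With a 1-7 scale, abs(x - 8) maps 1 to 7, 2 to 6, and so on, leaving 4 unchanged; a quick cross-tab (a minimal check, not part of the original output) can confirm the reverse-coding worked:
# Sanity check: original vs. reversed health ratings should form a perfect anti-diagonal
with(data, table(H_ST, RH_ST))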
First, let's look at the raw scores in the data.
What if we plotted some of the difference scores to see which images are good or bad?
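That plot isn't reproduced here; below is a minimal sketch of what a difference-score plot might look like, assuming the same geom_bin2d style used for the later plots in this analysis.
# Sketch: per-image disagreement between the two raters on health and taste
ggplot(data, aes(x = H_Diff, y = T_Diff)) +
  geom_bin2d() +
  xlab("Health rating difference") +
  ylab("Taste rating difference") +
  ggtitle("Rater disagreement per image")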
This looks pretty good; most of the responses are within one or two points of each other.
Let's keep only the images where the two raters are within 2 rating points of each other.
data2 <- filter(data, H_Diff<3)
dataClose <- filter(data2, T_Diff<3)
summary(dataClose)
## Qnum Image H_AML H_ST
## 1_Q6 : 1 Min. : 1.0 Min. :1.000 Min. :1.000
## 101_Q6 : 1 1st Qu.:103.5 1st Qu.:3.000 1st Qu.:3.000
## 105_Q6 : 1 Median :191.0 Median :5.000 Median :4.000
## 106_Q6 : 1 Mean :192.8 Mean :4.292 Mean :4.259
## 109_Q6 : 1 3rd Qu.:290.5 3rd Qu.:6.000 3rd Qu.:6.000
## 11_Q6 : 1 Max. :377.0 Max. :7.000 Max. :7.000
## (Other):237
## H_Diff T_AML T_ST T_Diff
## Min. :0.0000 Min. :1.000 Min. :1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :1.0000 Median :2.000 Median :2.000 Median :1.0000
## Mean :0.9136 Mean :2.325 Mean :2.469 Mean :0.9588
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:2.0000
## Max. :2.0000 Max. :7.000 Max. :7.000 Max. :2.0000
##
## RH_AML RH_ST RT_AML RT_ST
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:5.000 1st Qu.:5.000
## Median :3.000 Median :4.000 Median :6.000 Median :6.000
## Mean :3.708 Mean :3.741 Mean :5.675 Mean :5.531
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:7.000 3rd Qu.:7.000
## Max. :7.000 Max. :7.000 Max. :7.000 Max. :7.000
##
ggplot(dataClose, aes(x = H_Diff, y = T_Diff)) +
  geom_bin2d() +
  scale_fill_gradientn(limits = c(0, 30), breaks = seq(0, 30, by = 10), colours = rainbow(4))
Nice and clean!
As you can see, this cuts a little under 100 images from our data set, which will reduce time immensely. Let's replot with these values.
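The replot itself isn't shown here; below is a minimal sketch, assuming it used one rater's reversed ratings (RH_ST vs RT_ST) for the filtered images, along with a quick check of how many images the agreement filter drops.
# Roughly how many images does the agreement filter remove?
nrow(data) - nrow(dataClose)

# Sketch: replot of the reversed ratings for the close-agreement images
ggplot(dataClose, aes(x = RH_ST, y = RT_ST)) +
  geom_bin2d() +
  scale_fill_gradientn(limits = c(0, 30), breaks = seq(0, 30, by = 10), colours = rainbow(4)) +
  xlab("Health Rating") +
  ylab("Taste Rating")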
This is OK, but it looks a little weird. I'm not sure there is enough variability here, and we also lose two quadrants.
Just to be sure, let's create an average of the two raters and look at that.
dataClose$H_Ave <- (dataClose$RH_ST+dataClose$RH_AML)/2
dataClose$T_Ave <- (dataClose$RT_ST+dataClose$RT_AML)/2
ggplot(dataClose, aes(x = H_Ave, y = T_Ave)) +
  geom_bin2d() +
  scale_fill_gradientn(limits = c(0, 30), breaks = seq(0, 30, by = 10), colours = rainbow(4)) +
  xlab("Health Rating") +
  ylab("Taste Rating") +
  ggtitle("Average Plot")
Pretty good, but it is clearly missing the low-taste, low-health quadrant.
Now let's separate the lists and see how many images fall in each quadrant.
# Quadrants based on the averaged ratings of the close-agreement images
dataUH_UT <- filter(dataClose, H_Ave <= 4, T_Ave <= 4)  # unhealthy, untasty
dataH_T   <- filter(dataClose, H_Ave > 4,  T_Ave > 4)   # healthy, tasty
dataUH_T  <- filter(dataClose, H_Ave <= 4, T_Ave > 4)   # unhealthy, tasty
dataH_UT  <- filter(dataClose, H_Ave > 4,  T_Ave <= 4)  # healthy, untasty
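The counts themselves can be pulled with nrow() on each subset; a minimal check, assuming the four subsets just created:
# Number of close-agreement images in each health x taste quadrant
sapply(list(UH_UT = dataUH_UT, H_T = dataH_T, UH_T = dataUH_T, H_UT = dataH_UT), nrow)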
# What if we looked at Shravan's raw data?
dataUH_UT <- filter(data, RH_ST <= 4, RT_ST <= 4)  # unhealthy, untasty
dataH_T   <- filter(data, RH_ST > 4,  RT_ST > 4)   # healthy, tasty
dataUH_T  <- filter(data, RH_ST <= 4, RT_ST > 4)   # unhealthy, tasty
dataH_UT  <- filter(data, RH_ST > 4,  RT_ST <= 4)  # healthy, untasty
# What if we looked at Allison's raw data?
dataUH_UT <- filter(data, RH_AML <= 4, RT_AML <= 4)  # unhealthy, untasty
dataH_T   <- filter(data, RH_AML > 4,  RT_AML > 4)   # healthy, tasty
dataUH_T  <- filter(data, RH_AML <= 4, RT_AML > 4)   # unhealthy, tasty
dataH_UT  <- filter(data, RH_AML > 4,  RT_AML <= 4)  # healthy, untasty
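How the overlap between the two raters was judged isn't shown above; below is one possible check, using hypothetical object names (the quadrant subsets above overwrite one another, so the healthy-and-tasty sets are rebuilt for each rater separately).
# Sketch: images both raters place in the healthy & tasty quadrant
HT_ST  <- filter(data, RH_ST > 4,  RT_ST > 4)    # Shravan: healthy & tasty
HT_AML <- filter(data, RH_AML > 4, RT_AML > 4)   # Allison: healthy & tasty
sum(HT_ST$Image %in% HT_AML$Image)               # count of images in both sets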
OK, not a ton of overlap. What if we tried to narrow things down using averages over the full data set?
data$H_Ave_Full <- (data$RH_ST + data$RH_AML)/2
data$T_Ave_Full <- (data$RT_ST + data$RT_AML)/2
data$H_SD_Full <- sd(c(data$RH_ST, data$RH_AML), na.rm = F)  # one overall SD, recycled to every row
data$T_SD_Full <- sd(c(data$RT_ST, data$RT_AML), na.rm = F)
sd(c(data$RH_ST, data$RH_AML), na.rm = F)
## [1] 2.035953
ggplot(data, aes(x = H_Ave_Full, y = T_Ave_Full)) +
  geom_bin2d() +
  scale_fill_gradientn(limits = c(0, 30), breaks = seq(0, 30, by = 10), colours = rainbow(4)) +
  xlab("Health Rating") +
  ylab("Taste Rating") +
  ggtitle("Full Average Plot")
# Quadrants based on the full-sample average ratings
dataUH_UT <- filter(data, H_Ave_Full <= 4, T_Ave_Full <= 4)  # unhealthy, untasty
dataH_T   <- filter(data, H_Ave_Full > 4,  T_Ave_Full > 4)   # healthy, tasty
dataUH_T  <- filter(data, H_Ave_Full <= 4, T_Ave_Full > 4)   # unhealthy, tasty
dataH_UT  <- filter(data, H_Ave_Full > 4,  T_Ave_Full <= 4)  # healthy, untasty
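The same nrow() check sketched earlier gives the quadrant counts for the full-sample averages:
# Number of images in each quadrant under the full-sample average ratings
sapply(list(UH_UT = dataUH_UT, H_T = dataH_T, UH_T = dataUH_T, H_UT = dataH_UT), nrow)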