# Jeff Nieman  R Final Project

# Begin by choosing an interesting data set (and what is more interesting than chocolate cake?)

# Location of the "cake" data set (CSV) in the Rdatasets repository.
cakefile <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/lme4/cake.csv"

# The file is comma-separated and its first row holds the column names.
cake <- read.table(cakefile, sep = ",", header = TRUE)

# Preview the first four rows.
head(cake, n = 4)
##   X replicate recipe temperature angle temp
## 1 1         1      A         175    42  175
## 2 2         1      A         185    46  185
## 3 3         1      A         195    47  195
## 4 4         1      A         205    39  205
# Perform some basic transformations.  Transformation #1:  Begin by reducing the number of columns, then test.

# Keep only the first five columns of the raw data.
caketest <- cake[seq_len(5)]

# Preview the first four rows of the reduced table.
head(caketest, n = 4)
##   X replicate recipe temperature angle
## 1 1         1      A         175    42
## 2 2         1      A         185    46
## 3 3         1      A         195    47
## 4 4         1      A         205    39
# Transformation #2: Rename the column names, then test.

# Replace the original headers with descriptive names, in column order.
names(caketest) <- c(
  "Test_Number",
  "Repeat",
  "Recipe_Creator",
  "Oven_Temperature",
  "Break_Angle"
)

# Preview the first four rows with the new headers.
head(caketest, n = 4)
##   Test_Number Repeat Recipe_Creator Oven_Temperature Break_Angle
## 1           1      1              A              175          42
## 2           2      1              A              185          46
## 3           3      1              A              195          47
## 4           4      1              A              205          39
# Transformation #3: Add column that calculates temperature in Celsius, then test

# Append a Celsius column: (F - 32) * 5/9, rounded to the nearest whole degree.
caketest$Celsius_Temperature <- round((caketest$Oven_Temperature - 32) * 5 / 9)

# Preview the first four rows including the new column.
head(caketest, n = 4)
##   Test_Number Repeat Recipe_Creator Oven_Temperature Break_Angle
## 1           1      1              A              175          42
## 2           2      1              A              185          46
## 3           3      1              A              195          47
## 4           4      1              A              205          39
##   Celsius_Temperature
## 1                  79
## 2                  85
## 3                  91
## 4                  96
# Transformation #4: Add country column, then test
# NOTE(review): the country file is read from an absolute path on one specific
# machine; the script will not run elsewhere unless this path is adjusted.
cakecountry <- read.table(
  "file:///C:/Users/jenieman/Documents/GitHub/R-final/cake countries.csv",
  sep = ",",
  header = TRUE
)

# Attach the country data side by side; rows are paired by position, not by key.
cakenew <- cbind(caketest, cakecountry)

# Preview the first four rows of the combined table.
head(cakenew, n = 4)
##   Test_Number Repeat Recipe_Creator Oven_Temperature Break_Angle
## 1           1      1              A              175          42
## 2           2      1              A              185          46
## 3           3      1              A              195          47
## 4           4      1              A              205          39
##   Celsius_Temperature Country
## 1                  79     USA
## 2                  85     CHE
## 3                  91     DEU
## 4                  96     FRA
# Transformation #5:  Filter to only  show rows where the temperature in Celsius is greater than 100, then test

# Keep only the rows whose Celsius temperature is strictly above 100.
cakefilter <- cakenew[cakenew[["Celsius_Temperature"]] > 100, ]

# Preview the first four rows that passed the filter.
head(cakefilter, n = 4)
##    Test_Number Repeat Recipe_Creator Oven_Temperature Break_Angle
## 5            5      1              A              215          53
## 6            6      1              A              225          42
## 11          11      1              B              215          55
## 12          12      1              B              225          42
##    Celsius_Temperature Country
## 5                  102     GBR
## 6                  107     MEX
## 11                 102     USA
## 12                 107     CHE
# Transformation #6:  change abbreviations to names for recipe creator, then test

# Map each single-letter recipe code to the creator's name.
# An exact-match lookup is used instead of chained gsub() calls: gsub() rewrites
# every capital A/B/C it finds anywhere in the value, so re-running it on data
# that was already converted corrupts it ("Anna" would become "Annanna").
creator_names <- c(A = "Anna", B = "Betsy", C = "Carrie")

# Work on a character copy so the result is a plain character column.
creator_codes <- as.character(cakenew$Recipe_Creator)

# Only values that are exactly a known code are replaced; anything else
# (including names produced by an earlier run) is left untouched.
is_known_code <- creator_codes %in% names(creator_names)
creator_codes[is_known_code] <- creator_names[creator_codes[is_known_code]]
cakenew$Recipe_Creator <- creator_codes

# Preview the first ten rows to confirm the names.
head(cakenew, n = 10)
##    Test_Number Repeat Recipe_Creator Oven_Temperature Break_Angle
## 1            1      1           Anna              175          42
## 2            2      1           Anna              185          46
## 3            3      1           Anna              195          47
## 4            4      1           Anna              205          39
## 5            5      1           Anna              215          53
## 6            6      1           Anna              225          42
## 7            7      1          Betsy              175          39
## 8            8      1          Betsy              185          46
## 9            9      1          Betsy              195          51
## 10          10      1          Betsy              205          49
##    Celsius_Temperature Country
## 1                   79     USA
## 2                   85     CHE
## 3                   91     DEU
## 4                   96     FRA
## 5                  102     GBR
## 6                  107     MEX
## 7                   79     JPN
## 8                   85     AUS
## 9                   91     IND
## 10                  96     RUS
# Key Question to investigate:  What causes the larger break angles?
# Create basic summary statistics.  

# summary #1: What is summary of temperature?

# Summary statistics for the oven temperature column.
ot <- summary(cakenew[["Oven_Temperature"]])
print(ot)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     175     185     200     200     215     225
# Summary #2: What is summary of angles?

# Summary statistics for the break angle column.
ba <- summary(cakenew[["Break_Angle"]])
print(ba)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   26.00   31.00   32.12   36.75   63.00
# Run some graphical analysis

#  Test #1: Look at break angle histogram
# Draws a histogram of the Break_Angle column of cakenew. Only the data vector
# is passed, so binning, title and axis label are all left to hist()'s defaults.
hist(cakenew$Break_Angle)

# Observation #1: Mean break angle is 32.12 degrees. The distribution is roughly normal but slightly right-skewed, with a tail toward high angles (max is 63 degrees).

# Test #2:  Look at distribution of angles via a scatter plot looking at relationship between temperature and break angle

# Scatter plot with oven temperature on the x-axis and break angle on the y-axis.
plot(Break_Angle ~ Oven_Temperature, data = cakenew)

# Correlation between break angle and oven temperature.
with(cakenew, cor(Break_Angle, Oven_Temperature))
## [1] 0.3292422
# Observation #2: the 7 highest angles are all from the 2 highest temperatures, although the highest temperature does not guarantee the highest break angle.  The small positive correlation (0.329) indicates only a weak relationship between temperature and break angle, so other variables need to be investigated as well.

#Test #3 - Create a layered scatter plot testing recipe creator
# Attach ggplot2 with library() so that a missing package stops the script
# right here with a clear error. require() would only warn and return FALSE,
# and the failure would surface later as "could not find function ggplot".
library(ggplot2)

# Base plot object (break angle on x, oven temperature on y); it is stored in
# `g` so that different point layers can be added to it.
g <- ggplot(cakenew, aes(x = Break_Angle, y = Oven_Temperature))

# Scatter layer colored by recipe creator.
g + geom_point(aes(color = Recipe_Creator))

# Observation #3: At first glance it appears that Carrie's recipe may contribute to a greater break angle, as Carrie has two of the highest angles. 

# Test #4: Plot break angle against recipe creators.

# Box plot of break angle for each recipe creator.
# Uses cakenew (creator names) rather than caketest (single-letter codes) so the
# groups carry the names used throughout the observations. boxplot() is called
# directly so the grouping does not rely on Recipe_Creator being stored as a
# factor. Axis labels are given explicitly to match the column names.
boxplot(
  Break_Angle ~ Recipe_Creator,
  data = cakenew,
  xlab = "Recipe_Creator",
  ylab = "Break_Angle"
)

# Observation #4:  Carrie has two of the highest angles, but overall her mean is the same as Betsy's.  Anna's recipe has a higher actual mean.

# Test #5:  Create a layered scatter plot testing repeat test

# Same base scatter plot, with points colored by the Repeat column.
g + geom_point(mapping = aes(colour = Repeat))

# Observation #5:  The later tests have the smaller break angles.

# Test #6:  Look at country influence.

# Attach rworldmap with library() so that a missing package fails immediately
# with a clear error instead of require()'s warning-and-FALSE.
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')

# Average the break angle per country BEFORE joining to the map.
# The map holds one record per country, so joining the raw table (many rows
# per country) would color each country by a single observation rather than
# by its average break angle.
country_means <- aggregate(Break_Angle ~ Country, data = cakenew, FUN = mean)

# Join the per-country averages to the world map using ISO3 country codes.
# The join now reports one matched code per distinct country; the earlier run
# on the un-aggregated table reported all 270 rows as matched.
cakeworld <- joinCountryData2Map(
  country_means,
  joinCode = "ISO3",
  nameJoinColumn = "Country"
)

# Color each country by its mean break angle.
# NOTE(review): re-check the country ranking in the observation below against
# this averaged map.
mapCountryData(cakeworld, nameColumnToPlot = "Break_Angle")

# Observation #6:  Russia and India have the highest average break angles.

# Final conclusions:  The highest break angles appear to come from an earlier test, baked at the highest temperature, in Russia or India.