#install.packages("Stat2Data")
#install.packages("tidyverse")
library (Stat2Data)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v stringr 1.4.0
## v tidyr 1.1.0 v forcats 0.5.0
## v readr 1.3.1
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Raw-file URL of the cereal data set hosted on GitHub.
urlfile <- "https://raw.githubusercontent.com/okhaimova/CerealR/master/Cereal.csv"
# Download and parse the CSV. Its first column has no header, so readr
# names it 'X1' (see the warning recorded below).
Cereal <- readr::read_csv(file = urlfile)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## Cereal = col_character(),
## Calories = col_double(),
## Sugar = col_double(),
## Fiber = col_double()
## )
# Alternative source: to load the data from the Stat2Data package instead of
# the URL above, run the following line.
# data(Cereal)
# Per-column overview of the full data set.
summary(Cereal)
## X1 Cereal Calories Sugar
## Min. : 1.00 Length:36 Min. : 50.0 Min. : 0.000
## 1st Qu.: 9.75 Class :character 1st Qu.: 90.0 1st Qu.: 1.750
## Median :18.50 Mode :character Median :104.0 Median : 5.000
## Mean :18.50 Mean :101.6 Mean : 5.714
## 3rd Qu.:27.25 3rd Qu.:110.0 3rd Qu.: 9.075
## Max. :36.00 Max. :160.0 Max. :15.000
## Fiber
## Min. : 0.000
## 1st Qu.: 1.000
## Median : 3.000
## Mean : 3.592
## 3rd Qu.: 4.250
## Max. :14.000
# Mean and median of each nutrition column, evaluated inside the data frame.
with(Cereal, mean(Calories))
## [1] 101.6028
with(Cereal, median(Calories))
## [1] 104
with(Cereal, mean(Sugar))
## [1] 5.713889
with(Cereal, median(Sugar))
## [1] 5
with(Cereal, mean(Fiber))
## [1] 3.591667
with(Cereal, median(Fiber))
## [1] 3
# Collect the full-data statistics in one unnamed numeric vector for the later
# comparison. Order: mean Calories, median Calories, mean Sugar, median Sugar.
x <- unlist(
  lapply(
    Cereal[c("Calories", "Sugar")],
    function(values) c(mean(values), median(values))
  ),
  use.names = FALSE
)
# Working subset: cereals with at least 100 calories, keeping only the name,
# calorie and sugar columns.
# Standard `[` indexing with quoted column names is used instead of subset():
# subset() relies on non-standard evaluation and is documented as a
# convenience for interactive use, and the bare name `Cereal` in its select
# argument was easy to confuse with the data frame of the same name.
# which() discards rows whose Calories is NA, as subset() did.
cerealDF <- Cereal[
  which(Cereal$Calories >= 100),
  c("Cereal", "Calories", "Sugar")
]
# Rename the retained columns for the rest of the analysis.
colnames(cerealDF) <- c("Brand", "Cal", "SugarContent")
summary(cerealDF)
## Brand Cal SugarContent
## Length:25 Min. :100.0 Min. : 0.00
## Class :character 1st Qu.:100.0 1st Qu.: 3.00
## Mode :character Median :110.0 Median : 6.00
## Mean :112.3 Mean : 6.52
## 3rd Qu.:110.0 3rd Qu.:11.00
## Max. :160.0 Max. :15.00
# Mean and median of the two numeric columns in the subset.
mean(cerealDF$Cal)
## [1] 112.32
median(cerealDF$Cal)
## [1] 110
mean(cerealDF$SugarContent)
## [1] 6.52
median(cerealDF$SugarContent)
## [1] 6
# Collect the subset statistics in one unnamed numeric vector for comparison.
# Order: mean Cal, median Cal, mean SugarContent, median SugarContent.
y <- unlist(
  lapply(
    cerealDF[c("Cal", "SugarContent")],
    function(values) c(mean(values), median(values))
  ),
  use.names = FALSE
)
# Side-by-side comparison of the statistics: full data (x), the
# 100-plus-calorie subset (y), and their difference. Row and column labels
# are supplied at construction time.
CerealStat <- data.frame(
  Cereal = x,
  cerealDF = y,
  Difference = x - y,
  row.names = c(
    "Average Calories", "Median Calories", "Average Sugar", "Median Sugar"
  )
)
CerealStat
## Cereal cerealDF Difference
## Average Calories 101.602778 112.32 -10.7172222
## Median Calories 104.000000 110.00 -6.0000000
## Average Sugar 5.713889 6.52 -0.8061111
## Median Sugar 5.000000 6.00 -1.0000000
# Label each cereal by sugar content:
#   "Good" at or below 5, "Fair" above 5 and below 9, "Bad" at 9 or above.
# as.character() keeps the column character-typed even when no row matches.
cerealDF$SugarLevel <- with(
  cerealDF,
  as.character(
    ifelse(
      SugarContent <= 5, "Good",
      ifelse(SugarContent < 9, "Fair", "Bad")
    )
  )
)
# Show the first five labelled rows.
head(cerealDF, n = 5)
## # A tibble: 5 x 4
## Brand Cal SugarContent SugarLevel
## <chr> <dbl> <dbl> <chr>
## 1 Common Sense Oat Bran 100 6 Fair
## 2 Product 19 100 3 Good
## 3 Just Right 140 9 Bad
## 4 Special K 110 3 Good
## 5 Oatbake Raisin Nut 110 8 Fair
# BONUS – Place the original .csv file in a GitHub repository and have R read it from the raw-file link. This will be a very useful skill as you progress in your data science education and career.