Task:

Now it is your turn. Use what you have learned from the iris dataset on a new one. The dataset is uploaded as a csv file on the following URL: http://goo.gl/HKnl74. This is simulated data from an amusement park. For this exercise, make sure to write the report in an rmarkdown format for nicer presentation and sharing.

Load data

# Load the simulated amusement-park survey into `apd`.
# The file is addressed by its full path instead of calling setwd(), so the
# session's working directory is left untouched.
# stringsAsFactors = TRUE keeps `weekend` a two-level factor ("no"/"yes") on
# R >= 4.0 as well, which is what the summary() and str() output below shows.
data_dir <- "/Users/milosjanicki/Data_projects/Date_Guide"
apd <- read.csv(file.path(data_dir, "2.csv"), stringsAsFactors = TRUE)

First Glance EDA

# Per-column overview: level counts for the factor column and
# min / quartiles / mean / max for the numeric columns.
summary(apd)
##  weekend     num.child        distance            rides       
##  no :259   Min.   :0.000   Min.   :  0.5267   Min.   : 72.00  
##  yes:241   1st Qu.:0.000   1st Qu.: 10.3181   1st Qu.: 82.00  
##            Median :2.000   Median : 19.0191   Median : 86.00  
##            Mean   :1.738   Mean   : 31.0475   Mean   : 85.85  
##            3rd Qu.:3.000   3rd Qu.: 39.5821   3rd Qu.: 90.00  
##            Max.   :5.000   Max.   :239.1921   Max.   :100.00  
##      games             wait           clean          overall      
##  Min.   : 57.00   Min.   : 40.0   Min.   : 74.0   Min.   :  6.00  
##  1st Qu.: 73.00   1st Qu.: 62.0   1st Qu.: 84.0   1st Qu.: 40.00  
##  Median : 78.00   Median : 70.0   Median : 88.0   Median : 50.00  
##  Mean   : 78.67   Mean   : 69.9   Mean   : 87.9   Mean   : 51.26  
##  3rd Qu.: 85.00   3rd Qu.: 77.0   3rd Qu.: 91.0   3rd Qu.: 62.00  
##  Max.   :100.00   Max.   :100.0   Max.   :100.0   Max.   :100.00
# Compact structure listing: dimensions, column types and the first few values.
str(apd)
## 'data.frame':    500 obs. of  8 variables:
##  $ weekend  : Factor w/ 2 levels "no","yes": 2 2 1 2 1 1 2 1 1 2 ...
##  $ num.child: int  0 2 1 0 4 5 1 0 0 3 ...
##  $ distance : num  114.6 27 63.3 25.9 54.7 ...
##  $ rides    : int  87 87 85 88 84 81 77 82 90 88 ...
##  $ games    : int  73 78 80 72 87 79 73 70 88 86 ...
##  $ wait     : int  60 76 70 66 74 48 58 70 79 55 ...
##  $ clean    : int  89 87 88 89 87 79 85 83 95 88 ...
##  $ overall  : int  47 65 61 37 68 27 40 30 58 36 ...

Histograms, Correlation and Shapiro-Wilk Test for all variables

# Normality diagnostics for every numeric column of `apd`: a histogram, a
# normal Q-Q plot with reference line, and a Shapiro-Wilk test.
# Columns are resolved with with(apd, ...) rather than attach(apd), so nothing
# is added to the search path, while plot titles and the "data:" line of each
# test still show the bare column name.
# NOTE(review): the `mai` values set below also control the margins, so this
# `mar` call likely has no visible effect - confirm before removing it.
par(mar = c(1, 1, 1, 1))

# Panel 1 (2 x 3 grid): histograms on the top row, Q-Q plots on the bottom row.
par(mfrow = c(2, 3), mai = c(1, 0.1, 0.1, 0.1))
with(apd, {
  hist(num.child)
  hist(distance)
  hist(rides)
  qqnorm(num.child)
  qqline(num.child, col = 2)
  qqnorm(distance)
  qqline(distance, col = 2)
  qqnorm(rides)
  qqline(rides, col = 2)
})

with(apd, shapiro.test(rides))
## 
##  Shapiro-Wilk normality test
## 
## data:  rides
## W = 0.9915, p-value = 0.005945
with(apd, shapiro.test(num.child))
## 
##  Shapiro-Wilk normality test
## 
## data:  num.child
## W = 0.8885, p-value < 2.2e-16
with(apd, shapiro.test(distance))
## 
##  Shapiro-Wilk normality test
## 
## data:  distance
## W = 0.7358, p-value < 2.2e-16
# Panel 2 (2 x 3 grid): same layout for the satisfaction ratings.
par(mfrow = c(2, 3), mai = c(1, 0.1, 0.1, 0.1))
with(apd, {
  hist(games)
  hist(wait)
  hist(clean)
  qqnorm(games)
  qqline(games, col = 2)
  qqnorm(wait)
  qqline(wait, col = 2)
  qqnorm(clean)
  qqline(clean, col = 2)
})

with(apd, shapiro.test(games))
## 
##  Shapiro-Wilk normality test
## 
## data:  games
## W = 0.9942, p-value = 0.05255
with(apd, shapiro.test(wait))
## 
##  Shapiro-Wilk normality test
## 
## data:  wait
## W = 0.9955, p-value = 0.1629
with(apd, shapiro.test(clean))
## 
##  Shapiro-Wilk normality test
## 
## data:  clean
## W = 0.9918, p-value = 0.007461
# Overall satisfaction: histogram, Q-Q plot and test.
with(apd, {
  hist(overall)
  qqnorm(overall)
  qqline(overall, col = 2)
})
with(apd, shapiro.test(overall))
## 
##  Shapiro-Wilk normality test
## 
## data:  overall
## W = 0.9947, p-value = 0.08358

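Most distributions look roughly normal in the plots, but the Shapiro-Wilk test fails to reject normality at the 5% level only for games (p = 0.053), wait (p = 0.163) and overall (p = 0.084), so these three are most probably normally distributed. For rides, clean, num.child and distance the test rejects normality (p < 0.01).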

# Back to a single plot per page, then draw side-by-side boxplots of
# distance and the five rating columns (columns 3 to 8 of `apd`).
par(mfrow = c(1, 1))
boxplot_cols <- c("distance", "rides", "games", "wait", "clean", "overall")
boxplot(apd[, boxplot_cols])

The top quartile of distance is very spread out (the 3rd quartile is about 39.6 while the maximum is 239.2), which suggests a right-skewed variable with high variance.

# Base-graphics scatterplot matrix of every column against every other.
pairs(x = apd)

# GGally provides ggpairs(), used for the second pairwise plot matrix.
library(GGally)

# Pairwise plot matrix of all columns drawn with GGally.
ggpairs(data = apd)

library(corrplot)
# Correlation matrix of columns 2 to 8, i.e. every column except the
# `weekend` factor; printed explicitly so the values appear in the report.
apd.correlations <- cor(apd[, 2:8])
print(apd.correlations)
##              num.child     distance        rides        games         wait
## num.child  1.000000000 -0.012136454 -0.040260243  0.004658171 -0.020972921
## distance  -0.012136454  1.000000000 -0.009525919 -0.006502585 -0.007391617
## rides     -0.040260243 -0.009525919  1.000000000  0.455185111  0.314199509
## games      0.004658171 -0.006502585  0.455185111  1.000000000  0.299104980
## wait      -0.020972921 -0.007391617  0.314199509  0.299104980  1.000000000
## clean     -0.013451671  0.003968523  0.789565053  0.516979874  0.367884670
## overall    0.319480357  0.087126019  0.585986282  0.437467872  0.572621659
##                  clean    overall
## num.child -0.013451671 0.31948036
## distance   0.003968523 0.08712602
## rides      0.789565053 0.58598628
## games      0.516979874 0.43746787
## wait       0.367884670 0.57262166
## clean      1.000000000 0.63939818
## overall    0.639398176 1.00000000
# Visual rendering of the same correlation matrix.
corrplot(corr = apd.correlations)

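The strongest correlation is between clean and rides (r ≈ 0.79). Overall satisfaction is moderately correlated with clean (0.64), rides (0.59) and wait (0.57), while num.child and distance are essentially uncorrelated with the individual ratings.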

# Weekday ("no") versus weekend ("yes") visits: proportions, then a bar chart
# of the raw counts. The frequency table is built once and reused.
weekend_counts <- table(apd$weekend)
prop.table(weekend_counts)
## 
##    no   yes 
## 0.518 0.482
barplot(weekend_counts)

# Number of children per visit: proportions, then a bar chart of the counts.
child_counts <- table(apd$num.child)
prop.table(child_counts)
## 
##     0     1     2     3     4     5 
## 0.302 0.132 0.286 0.136 0.094 0.050
barplot(child_counts)

Visitors come slightly more often on weekdays (51.8%) than on weekends (48.2%), and most often bring either no children (30.2%) or two children (28.6%).