Write a short data mining report on the CogSci Intro Week Personality Test Data in which you answer the following questions in prose, code and graphs:
# Setup ----
# NOTE(review): setwd() ties this script to one machine. It is kept so that the
# relative data path below still resolves; prefer running from the project
# directory instead.
setwd("C:/Portfolios")
library(ggplot2)
library(pastecs)
library(Hmisc)

# Read the tab-delimited data. stringsAsFactors = FALSE keeps text columns as
# character (not factor), so Ballon_rt can later be converted with as.numeric().
data <- read.delim("CogSciPersonality2016.txt", stringsAsFactors = FALSE)

# Descriptive statistics (including mean and standard deviation) of
# breath-holding time for each ocular dominance group.
by(data$Hold_breath, data$Ocular_dominance, stat.desc)
## Warning in qt((0.5 + p/2), (Nbrval - 1)): NaNs produced
## data$Ocular_dominance: Both
## nbr.val nbr.null nbr.na min max
## 1 0 0 56 56
## range sum median mean SE.mean
## 0 56 56 56 NA
## CI.mean.0.95 var std.dev coef.var
## NaN NA NA NA
## --------------------------------------------------------
## data$Ocular_dominance: Left
## nbr.val nbr.null nbr.na min max
## 23.0000000 0.0000000 0.0000000 18.0000000 124.0000000
## range sum median mean SE.mean
## 106.0000000 1215.6400000 47.0000000 52.8539130 4.3581006
## CI.mean.0.95 var std.dev coef.var
## 9.0381475 436.8399431 20.9007163 0.3954431
## --------------------------------------------------------
## data$Ocular_dominance: Right
## nbr.val nbr.null nbr.na min max
## 38.0000000 0.0000000 0.0000000 0.2700000 120.0000000
## range sum median mean SE.mean
## 119.7300000 1805.8700000 46.0000000 47.5228947 3.6365149
## CI.mean.0.95 var std.dev coef.var
## 7.3682791 502.5211400 22.4169833 0.4717091
# Bar plot of mean breath-holding time per ocular dominance group, with error
# bars of +/- 1 standard error (mean_se). A group with a single observation has
# no standard error, so its error bar is dropped with a warning.
# `fun` replaces the deprecated `fun.y` argument (ggplot2 >= 3.3.0).
ggplot(data, aes(Ocular_dominance, Hold_breath, fill = Ocular_dominance)) +
  geom_bar(stat = "summary", fun = mean, color = "black") +
  geom_errorbar(stat = "summary", fun.data = mean_se, width = 0.4) +
  labs(
    x = "Ocular Dominance",
    y = "Hold breath time in seconds",
    fill = "Ocular Dominance"
  ) +
  theme(legend.position = "none")
## Warning: Removed 1 rows containing missing values (geom_errorbar).
… your prose answer here …
Left: mean = 52.85, sd = 20.90. Right: mean = 47.52, sd = 22.42. Both: mean = 56, no sd since there is only 1 data point. On average, people with left ocular dominance can hold their breath longer than people with right ocular dominance.
# Descriptive statistics of the Volume levels for each gender, using by() with stat.desc
by(data$Volume, data$Gender, stat.desc)
## data$Gender: female
## nbr.val nbr.null nbr.na min max
## 40.0000000 2.0000000 0.0000000 0.0000000 72.0000000
## range sum median mean SE.mean
## 72.0000000 1116.0000000 30.0000000 27.9000000 2.6758176
## CI.mean.0.95 var std.dev coef.var
## 5.4123520 286.4000000 16.9233566 0.6065719
## --------------------------------------------------------
## data$Gender: male
## nbr.val nbr.null nbr.na min max
## 22.0000000 3.0000000 0.0000000 0.0000000 80.0000000
## range sum median mean SE.mean
## 80.0000000 393.0000000 15.0000000 17.8636364 3.6987745
## CI.mean.0.95 var std.dev coef.var
## 7.6920226 300.9805195 17.3487901 0.9711791
# Bar plot of mean preferred volume level per gender, with error bars of
# +/- 1 standard error (mean_se).
# `fun` replaces the deprecated `fun.y` argument (ggplot2 >= 3.3.0).
ggplot(data, aes(Gender, Volume, fill = Gender)) +
  geom_bar(stat = "summary", fun = mean, color = "black") +
  geom_errorbar(stat = "summary", fun.data = mean_se, width = 0.4) +
  labs(x = "Gender", y = "Volume level") +
  theme(legend.position = "none")
… your prose answer here …
Females: mean volume level = 27.90, sd = 16.92. Males: mean volume level = 17.86, sd = 17.35. On average, females listen to louder music than males.
For both questions, provide mean and standard deviation for both groups and make a bar plot with error bars that illustrate the difference (if there is one).
# Density histogram of breath-holding time with a normal curve overlaid. The
# curve uses the sample mean and standard deviation (NA values removed).
# after_stat(density) replaces the deprecated `..density..` notation
# (ggplot2 >= 3.3.0). `size` is kept for ggplot2 < 3.4; newer releases prefer
# `linewidth` for lines.
ggplot(data, aes(Hold_breath)) +
  geom_histogram(
    aes(y = after_stat(density)),
    fill = "white", color = "black", binwidth = 4
  ) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(data$Hold_breath, na.rm = TRUE),
      sd = sd(data$Hold_breath, na.rm = TRUE)
    ),
    color = "blue", size = 1
  )

# Q-Q plot of breath-holding time (ggplot() + geom_qq() instead of the
# deprecated qplot()).
ggplot(data, aes(sample = Hold_breath)) +
  geom_qq() +
  labs(title = "Hold breath")

# Calculating skew.2SE, kurt.2SE and normtest.p. basic = FALSE leaves out the
# basic count/range statistics; norm = TRUE adds the normality statistics.
round(stat.desc(data$Hold_breath, basic = FALSE, norm = TRUE), 3)
## median mean SE.mean CI.mean.0.95 var
## 46.500 49.637 2.752 5.504 469.707
## std.dev coef.var skewness skew.2SE kurtosis
## 21.673 0.437 0.842 1.386 2.481
## kurt.2SE normtest.W normtest.p
## 2.070 0.925 0.001
# Full data: skew.2SE = 1.39, kurt.2SE = 2.07, normtest.p = 0.001

# We have two measurement errors - people who allegedly can't hold their breath
# for even 1 second. Running the same analysis with the measurement errors
# removed.

# New data set without the measurement errors. which() drops rows where the
# comparison is NA instead of turning them into all-NA rows.
Hold_breath1 <- data[which(data$Hold_breath > 1), ]

# Density histogram with a normal curve based on the filtered sample's mean and
# standard deviation. after_stat(density) replaces the deprecated `..density..`
# notation (ggplot2 >= 3.3.0). `size` is kept for ggplot2 < 3.4; newer releases
# prefer `linewidth` for lines.
ggplot(Hold_breath1, aes(Hold_breath)) +
  geom_histogram(
    aes(y = after_stat(density)),
    fill = "white", color = "black", binwidth = 4
  ) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(Hold_breath1$Hold_breath, na.rm = TRUE),
      sd = sd(Hold_breath1$Hold_breath, na.rm = TRUE)
    ),
    color = "blue", size = 1
  )

# Q-Q plot of the filtered data (ggplot() + geom_qq() instead of the
# deprecated qplot()).
ggplot(Hold_breath1, aes(sample = Hold_breath)) +
  geom_qq() +
  labs(title = "Hold breath no measurement errors")

# Calculating skew.2SE, kurt.2SE and normtest.p for the filtered data.
round(stat.desc(Hold_breath1$Hold_breath, basic = FALSE, norm = TRUE), 3)
## median mean SE.mean CI.mean.0.95 var
## 47.000 51.280 2.584 5.170 400.554
## std.dev coef.var skewness skew.2SE kurtosis
## 20.014 0.390 1.360 2.203 3.077
## kurt.2SE normtest.W normtest.p
## 2.528 0.892 0.000
… your prose answer here …
The hold breath data is not normally distributed. For the data with measurement errors, the normtest.p is 0.001, which is a result of the significant positive skew (skew.2SE = 1.386) and the kurtosis (kurt.2SE = 2.070). Furthermore, the Q-Q-plot is quite flat, far away from the straight line of a normal distribution.
Even with the measurement errors removed, the data is still not normally distributed. There is both significant skew and kurtosis (skew.2SE = 2.203, kurt.2SE = 2.528) and the normtest.p is 0.000. Removing the measurement errors actually made the data resemble a normal distribution even less.
# Ballon_rt was not read in as numeric; coerce it in place. Entries that cannot
# be parsed as numbers become NA (with a coercion warning).
data[["Ballon_rt"]] <- as.numeric(data[["Ballon_rt"]])
## Warning: NAs introduced by coercion
# Density histogram of balloon reaction time with a normal curve based on the
# sample mean and standard deviation (NA values removed).
# The column is referenced by bare name inside aes(): `data$` inside aes()
# bypasses the plot's data argument and leaks into the default axis label.
# after_stat(density) replaces the deprecated `..density..` notation
# (ggplot2 >= 3.3.0). `size` is kept for ggplot2 < 3.4; newer releases prefer
# `linewidth` for lines.
ggplot(data, aes(Ballon_rt)) +
  geom_histogram(
    aes(y = after_stat(density)),
    fill = "white", color = "black", binwidth = 3
  ) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(data$Ballon_rt, na.rm = TRUE),
      sd = sd(data$Ballon_rt, na.rm = TRUE)
    ),
    color = "blue", size = 1
  )
## Warning: Removed 1 rows containing non-finite values (stat_bin).
# Q-Q plot of balloon reaction time (ggplot() + geom_qq() instead of the
# deprecated qplot()).
ggplot(data, aes(sample = Ballon_rt)) +
  geom_qq() +
  labs(title = "Baloon RT")
## Warning: Removed 1 rows containing non-finite values (stat_qq).
# Calculating skew.2SE, kurt.2SE and normtest.p for balloon reaction time.
# basic = FALSE leaves out the basic count/range statistics; norm = TRUE adds
# the normality statistics.
round(stat.desc(data$Ballon_rt, basic = FALSE, norm = TRUE), 3)
## median mean SE.mean CI.mean.0.95 var
## 20.000 26.904 3.223 6.446 633.515
## std.dev coef.var skewness skew.2SE kurtosis
## 25.170 0.936 2.780 4.539 7.276
## kurt.2SE normtest.W normtest.p
## 6.025 0.590 0.000
#skew.2SE = 4.539
#kurt.2SE = 6.025
#normtest.p = 0.000.
… your prose answer here …
The balloon reaction time data is not normally distributed. There is a lot of positive skew (skew.2SE = 4.539) and lots of kurtosis as well (kurt.2SE = 6.025). The normtest.p is 0.00, indicating that this is in no way a normal distribution. The Q-Q-plot supports this, since it’s almost just a flat line.
# Scatterplot with shoe size on the x-axis and breath-holding time on the
# y-axis, plus a fitted linear regression line (geom_smooth, method = "lm").
# Columns are referenced by bare name inside aes() rather than via `data$`, so
# the mapping is resolved against the plot's data argument.
ggplot(data, aes(Shoe_size, Hold_breath)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Shoe Size", y = "Hold breath time in seconds")

# Same plot for the hold breath data without measurement errors.
ggplot(Hold_breath1, aes(Shoe_size, Hold_breath)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Shoe Size", y = "Hold breath time in seconds")
… your prose answer here …
There seems to be very weak or no correlation between shoe size and breath-holding time. The fitted line goes up, indicating some correlation, but looking at the data it’s very slight. The data looks a bit better with the measurement errors removed, but it’s still far from a linear relationship.
# Shoe size vs. breath-holding time, split by gender: color = Gender colors the
# points and fits one linear regression line per gender.
# Columns are referenced by bare name inside aes() rather than via `data$`, so
# all mappings (including the Gender grouping) come from the same data argument.
ggplot(data, aes(Shoe_size, Hold_breath, color = Gender)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Shoe Size", y = "Hold breath time in seconds")

# Same plot for the data without measurement errors.
ggplot(Hold_breath1, aes(Shoe_size, Hold_breath, color = Gender)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Shoe Size", y = "Hold breath time in seconds")
… your prose answer here …
For the females there seems to be some correlation, but for males there is absolutely none. Without measurement errors the data looks a bit more correlated for females (but still not really), but for males there are still no signs of correlation at all.
Once the portfolio is filled in, you “knit” it, save it and upload it to Blackboard under Experimental Methods > Assignments
Deadline 30 September 2016 at 8 p.m.