#load dataset sb12-qr1.sav
library(foreign)
library(ggplot2)
# Let the user pick the SPSS (.sav) file in a file dialog, then read it in as
# a data frame.
db <- file.choose()
dataset <- read.spss(file = db, to.data.frame = TRUE)
/Users/qumino2/Downloads/17_spring_courses/Quantitative_course_做Udacity/exercise_for_Heidi/exercise_data.sav: Unrecognized record type 7, subtype 18 encountered in system file/Users/qumino2/Downloads/17_spring_courses/Quantitative_course_做Udacity/exercise_for_Heidi/exercise_data.sav: Unrecognized record type 7, subtype 21 encountered in system file
# Frequency count of each distinct value of Gender.
print(table(dataset$Gender))
0 1
1382 715
Based on the result, ‘Gender’ is a dichotomous variable: it takes only two values, 0 (n = 1382) and 1 (n = 715).
# Histogram for Year_birth
# One bin per year between 1950 and 1995; the x axis is limited to 1949-1996.
ggplot(data = dataset, mapping = aes(x = Year_birth)) +
  geom_histogram(
    breaks = seq(from = 1950, to = 1995, by = 1),
    colour = "red",
    fill = "green",
    alpha = 0.2
  ) +
  labs(title = "Histogram for Year_birth") +
  xlim(1949, 1996)
# Q-Q plot for Year_birth
# Normal Q-Q plot with a reference line (palette colour 2), followed by the
# Shapiro-Wilk normality test on the same variable.
qqnorm(dataset$Year_birth)
qqline(dataset$Year_birth, col = 2)
shapiro.test(dataset$Year_birth)
Shapiro-Wilk normality test
data: dataset$Year_birth
W = 0.73774, p-value < 2.2e-16
Based on the result (W = 0.73774, p-value < 2.2e-16), Year_birth is not normally distributed.
# Histogram for credits_total
# Bins are 10 units wide.
ggplot(data = dataset, mapping = aes(x = credits_total)) +
  geom_histogram(
    binwidth = 10,
    colour = "red",
    fill = "green",
    alpha = 0.2
  ) +
  labs(title = "Histogram for credits_total")
# Q-Q plot for credits_total
# Normal Q-Q plot with a reference line (palette colour 2), followed by the
# Shapiro-Wilk normality test on the same variable.
qqnorm(dataset$credits_total)
qqline(dataset$credits_total, col = 2)
shapiro.test(dataset$credits_total)
Shapiro-Wilk normality test
data: dataset$credits_total
W = 0.93642, p-value < 2.2e-16
Based on the result (W = 0.93642, p-value < 2.2e-16), credits_total is not normally distributed.
# Histogram for q40
# Bins are 1 unit wide.
ggplot(data = dataset, mapping = aes(x = q40)) +
  geom_histogram(
    binwidth = 1,
    colour = "red",
    fill = "green",
    alpha = 0.2
  ) +
  labs(title = "Histogram for q40")
NA
# Q-Q plot for q40
# Normal Q-Q plot with a reference line (palette colour 2), followed by the
# Shapiro-Wilk normality test on the same variable.
qqnorm(dataset$q40)
qqline(dataset$q40, col = 2)
shapiro.test(dataset$q40)
Shapiro-Wilk normality test
data: dataset$q40
W = 0.74137, p-value < 2.2e-16
Based on the result (W = 0.74137, p-value < 2.2e-16), q40 is not normally distributed.
# a. gender and q12_2
# Relabel the gender codes in place: 0 becomes "Female", 1 becomes "Male".
dataset$Gender <- replace(dataset$Gender, dataset$Gender == 0, "Female")
dataset$Gender <- replace(dataset$Gender, dataset$Gender == 1, "Male")
# Cross-tabulate gender (rows) against the q12_2 answers (columns) and show it.
a.tab <- table(dataset$Gender, dataset$q12_2)
print(a.tab)
Strongly disagree Moderately disagree
Female 15 25
Male 11 23
Slightly disagree Undecided Slightly agree
Female 78 232 552
Male 58 110 291
Moderately agree Strongly agree
Female 299 170
Male 149 69
# Pearson chi-squared test on the gender x q12_2 contingency table.
chisq.test(x = a.tab)
Pearson's Chi-squared test
data: a.tab
X-squared = 12.953, df = 6, p-value = 0.04379
Based on the result (X-squared = 12.953, df = 6, p-value = 0.04379), there is a significant difference between male and female respondents’ answers to q12_2 at the 0.05 level.
# b. gender and q15_7
# Cross-tabulate gender (rows) against the q15_7 answers (columns) and show it.
b.tab <- table(dataset$Gender, dataset$q15_7)
print(b.tab)
Strongly disagree Moderately disagree
Female 64 137
Male 20 72
Slightly disagree Undecided Slightly agree
Female 421 369 292
Male 174 196 180
Moderately agree Strongly agree
Female 74 22
Male 56 11
# Pearson chi-squared test on the gender x q15_7 contingency table.
chisq.test(x = b.tab)
Pearson's Chi-squared test
data: b.tab
X-squared = 18.411, df = 6, p-value = 0.005283
Based on the result (X-squared = 18.411, df = 6, p-value = 0.005283), there is a significant difference between male and female respondents’ answers to q15_7.
#c. credits_autumn and q12_1. Combine categories in credits_autumn so that you’ll end up
#with 3-4 different categories.
# Attach car with library() so that a missing package stops the script right
# here; require() would only return FALSE and the failure would surface later.
library(car)
# Collapse the original credit bands into three groups. The three recode
# rules are joined with "; " into the single specification string that
# recode() expects. The call is namespace-qualified so that a same-named
# function from another attached package cannot be picked up instead.
dataset$credits_autumn <- car::recode(
  dataset$credits_autumn,
  paste(
    "c('none', 'Less than 10', '10-14', '15-19') = 'Less than 19'",
    "c('20-24', '25-29', '30-34', '35-39') = '20-39'",
    "c('40-44', '45 or more') = '40 or more'",
    sep = "; "
  )
)
# Show how many respondents fall into each combined group.
table(dataset$credits_autumn)
20-39 40 or more Less than 19
1289 302 508
# Cross-tabulate the combined credits_autumn groups (rows) against the q12_1
# answers (columns) and show the table.
c.tab <- table(dataset$credits_autumn, dataset$q12_1)
print(c.tab)
Strongly disagree Moderately disagree
20-39 643 211
40 or more 168 37
Less than 19 183 67
Slightly disagree Undecided
20-39 176 93
40 or more 41 18
Less than 19 75 51
Slightly agree Moderately agree
20-39 127 17
40 or more 25 4
Less than 19 84 22
Strongly agree
20-39 11
40 or more 4
Less than 19 16
# Pearson chi-squared test on the credits_autumn x q12_1 contingency table.
# This must use c.tab: testing b.tab here would only repeat the
# gender x q15_7 test from section b.
chisq.test(c.tab)
Pearson's Chi-squared test
data: b.tab
X-squared = 18.411, df = 6, p-value = 0.005283
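Based on the result, there is a significant difference. Note, however, that the output shown above reports "data: b.tab" (gender and q15_7): it is not a test of credits_autumn against q12_1, which requires running chisq.test on c.tab.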
#d. q40 and q28. At first, categorize variable q40.
# Bin the numeric q40 values in a single step with cut(). The earlier version
# indexed through `dataset_test`, which is never created in this script, and
# overwrote q40 piece by piece, which turns the column into character after
# the first assignment. Intervals are right-closed, so the bins are
# q40 <= 10, 10 < q40 <= 20 and 20 < q40 <= 40. Values above 40 and missing
# values become NA and are therefore left out of the table below.
dataset$q40 <- cut(
  dataset$q40,
  breaks = c(-Inf, 10, 20, 40),
  labels = c("0-10", "11-20", "21-40")
)
# Cross-tabulate the q40 bins (rows) against the q28 answers (columns).
d.tab <- table(dataset$q40, dataset$q28)
d.tab
Yes, constantly Yes, often Sometimes Not often
0-10 33 160 652 664
11-20 1 10 45 41
21-40 0 3 4 5
Not at all
0-10 392
11-20 34
21-40 3
# Pearson chi-squared test on the q40-bin x q28 contingency table.
chisq.test(x = d.tab)
Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: d.tab
X-squared = 5.8199, df = 8, p-value = 0.6674
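Based on the result (X-squared = 5.8199, df = 8, p-value = 0.6674), there is no significant difference in q28 answers between the q40 categories. Because R warned that the chi-squared approximation may be incorrect, this result should be interpreted with caution.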