Objectives: Students are able to use R language
to
1. perform descriptive statistics
2. construct scatterplot between two quantitative variables
3. perform correlation analysis
4. perform linear regression analysis and inference on regression
parameters
5. interpret the results
You can download data file in Mango CMU –> Lab Data.
Data file is SalePrice.csv
If the data file is in excel file .xls, we need to change it to
be .csv before importing it into R. To do this, you need to download
data file –> Open file –> Save as .csv
Import data into R
Then we go to R studio window, we can type in pur code in R script. Note
that your working directory (the place where the data file is at) will
be different from mine.
#set working directory
#Go to the data file property--> Copy the file location-->paste the location in setwd() command --> change \ to \\
setwd('C:\\Users\\ASUS\\Downloads')
#read .csv file in R
Mydata = read.csv('SalePrice.csv',header=TRUE)
Mydata #call to see data and variables
## Obs Area Sale.price
## 1 1 13 115
## 2 2 4 45
## 3 3 12 100
## 4 4 5 50
## 5 5 6 55
## 6 6 8 85
## 7 7 3 40
## 8 8 4 50
## 9 9 5 45
## 10 10 7 70
x = Mydata$Area #set x (independent variable)
y = Mydata$Sale.price #set y (dependent variable)
plot(x,y,xlab="Area",ylab="Sale price",main="Scatterplot of sale price and area")
cor(x,y)
## [1] 0.978902
cor.test(x, y,alternative = c("two.sided"),conf.level = 0.95)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 13.55, df = 8, p-value = 8.451e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9103840 0.9951655
## sample estimates:
## cor
## 0.978902
fit = lm(y~x) #model fitting
summary(fit) #some basic information about the model is output
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.738 -4.618 0.987 2.269 9.741
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.202 4.120 3.69 0.00613 **
## x 7.507 0.554 13.55 8.45e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.653 on 8 degrees of freedom
## Multiple R-squared: 0.9582, Adjusted R-squared: 0.953
## F-statistic: 183.6 on 1 and 8 DF, p-value: 8.451e-07
confint.lm(fit,level=0.95) #to obtain a confidence interval for the coefficient parameters,
## 2.5 % 97.5 %
## (Intercept) 5.701166 24.702293
## x 6.229628 8.784781
yhat = predict(fit) #compute fitted y
yhat
## 1 2 3 4 5 6 7 8
## 112.79539 45.23055 105.28818 52.73775 60.24496 75.25937 37.72334 45.23055
## 9 10
## 52.73775 67.75216
plot(x,y,xlab="Area",ylab="Sale price",main="Scatterplot of sale price and area")
lines(x,yhat,type="o",pch=20)
res = resid(fit)
res
## 1 2 3 4 5 6 7
## 2.2046110 -0.2305476 -5.2881844 -2.7377522 -5.2449568 9.7406340 2.2766571
## 8 9 10
## 4.7694524 -7.7377522 2.2478386
par(mfrow=c(1,2)) #set plot layout as 1 row 2 column
plot(res)
hist(res)
fit:the point or fitted estimate
lwr: the lower bound of the interval
upr: the upper bound of the interval
new = data.frame(x = 13)
pred = predict(fit, new)
pred
## 1
## 112.7954
pred.CI = predict(fit, newdata=new, interval = "confidence")
pred.CI
## fit lwr upr
## 1 112.7954 103.7525 121.8382
pred.PI = predict(fit, newdata=new, interval = "prediction")
pred.PI
## fit lwr upr
## 1 112.7954 96.93079 128.66
new = data.frame(x = sort(x))
new
## x
## 1 3
## 2 4
## 3 4
## 4 5
## 5 5
## 6 6
## 7 7
## 8 8
## 9 12
## 10 13
pred.CI = predict(fit, newdata=new, interval = "confidence")
pred.CI
## fit lwr upr
## 1 37.72334 31.45150 43.99519
## 2 45.23055 39.85561 50.60549
## 3 45.23055 39.85561 50.60549
## 4 52.73775 48.07854 57.39697
## 5 52.73775 48.07854 57.39697
## 6 60.24496 56.02702 64.46289
## 7 67.75216 63.61234 71.89198
## 8 75.25937 70.81531 79.70342
## 9 105.28818 97.36103 113.21534
## 10 112.79539 103.75253 121.83825
pred.PI = predict(fit, newdata=new, interval = "prediction")
pred.PI
## fit lwr upr
## 1 37.72334 23.25793 52.18875
## 2 45.23055 31.13083 59.33027
## 3 45.23055 31.13083 59.33027
## 4 52.73775 38.89505 66.58045
## 5 52.73775 38.89505 66.58045
## 6 60.24496 46.54448 73.94543
## 7 67.75216 54.07553 81.42879
## 8 75.25937 61.48759 89.03114
## 9 105.28818 90.03198 120.54439
## 10 112.79539 96.93079 128.65998
# create a plot
plot(x,y,xlab="Area",ylab="Sale price",main="Scatterplot of sale price and area")
lines(x,yhat,type="o",pch=20)
# 95% confidence interval
points(new$x,pred.CI[,"lwr"],col="red",type="o",pch=20)
points(new$x,pred.CI[,"upr"],col="red",type="o",pch=20)
# 95% prediction interval
points(new$x,pred.PI[,"lwr"],col="blue",type="o",pch=20)
points(new$x,pred.PI[,"upr"],col="blue",type="o",pch=20)
JobSatisfaction.csvsetwd("C:\\Users\\ASUS\\Downloads")
Mydata2=read.csv('JobSatisfaction.csv',header=TRUE)
str(Mydata2) #Check variables type
## 'data.frame': 20 obs. of 9 variables:
## $ Obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 33 27 22 34 24 37 25 31 40 26 ...
## $ Gender : chr "Male" "Male" "Female" "Male" ...
## $ Status : chr "Married" "Married" "Single" "Married" ...
## $ Education : chr "Undergrad" "Undergrad" "Master " "Undergrad" ...
## $ Smoke : chr "No" "Yes" "Yes" "No" ...
## $ Income : int 15050 20600 15200 22500 30400 33850 15100 34900 23200 32000 ...
## $ Life.Satisfaction: int 3 2 4 3 4 3 2 1 1 1 ...
## $ Job.Satisfaction : int 3 4 1 4 3 3 3 2 3 3 ...
# frequency table of gender
table1 <- table(Mydata2$Gender)
table1
##
## Female Male
## 9 11
prop.table(table1) #compute percentage
##
## Female Male
## 0.45 0.55
# percentages for gender by highest education
table2 <- table(Mydata2$Education, Mydata2$Gender)
table2
##
## Female Male
## High school 2 2
## Master 2 1
## Secondary school 1 1
## Undergrad 4 7
prop.table(table2)
##
## Female Male
## High school 0.10 0.10
## Master 0.10 0.05
## Secondary school 0.05 0.05
## Undergrad 0.20 0.35
# Descriptive statistics for quantitative data
summary(Mydata2$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.00 25.00 30.50 31.55 38.50 44.00
summary(Mydata2$Income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13500 17300 22850 24372 30800 39560
boxplot(Mydata2$Income)
# Descriptive between qualitative and quantitative
boxplot(Mydata2$Income~Mydata2$Gender,xlab="Gender",ylab="Income")
boxplot(Mydata2$Income~Mydata2$Job.Satisfaction,xlab="Job Satisfaction",ylab="Income")
x = Mydata2$Age #set x (independent variable)
y = Mydata2$Income #set y (dependent variable)
plot(x,y,xlab="Age",ylab="Income",main="Scatterplot of age and income")
cor(x,y)
## [1] 0.5424278
cor.test(x, y,alternative = c("two.sided"),conf.level = 0.95)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 2.7393, df = 18, p-value = 0.01348
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1314628 0.7942906
## sample estimates:
## cor
## 0.5424278
fit = lm(y~x) #model fitting
summary(fit) #some basic information about the model is output
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10957 -4321 -1207 5244 10833
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6893.1 6567.0 1.050 0.3078
## x 554.0 202.2 2.739 0.0135 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6950 on 18 degrees of freedom
## Multiple R-squared: 0.2942, Adjusted R-squared: 0.255
## F-statistic: 7.504 on 1 and 18 DF, p-value: 0.01348
confint.lm(fit) #to obtain a confidence interval for the coefficient estimates,
## 2.5 % 97.5 %
## (Intercept) -6903.6540 20689.9379
## x 129.1102 978.8681
yhat = predict(fit) #compute fitted y
yhat
## 1 2 3 4 5 6 7 8
## 25174.78 21850.85 19080.90 25728.77 20188.88 27390.74 20742.87 24066.81
## 9 10 11 12 13 14 15 16
## 29052.71 21296.86 29606.70 27944.73 22404.84 31268.67 18526.91 20742.87
## 17 18 19 20
## 23512.82 30160.69 30714.68 17972.93
plot(x,y,xlab="Age",ylab="Income",main="Linear fitted line")
lines(x,yhat,type="o",pch=20)
res = resid(fit)
res
## 1 2 3 4 5 6
## -10124.7843 -1250.8493 -3880.9035 -3228.7734 10211.1182 6459.2591
## 7 8 9 10 11 12
## -5642.8710 10833.1940 -5852.7084 10703.1398 -10956.6976 625.2699
## 13 14 15 16 17 18
## 3595.1615 8291.3349 -526.9143 -7242.8710 -1162.8168 4839.3133
## 19 20
## -2714.6759 -2972.9252
par(mfrow=c(1,2)) #set plot layout as 1 row 2 column
plot(res)
hist(res)
new2 = data.frame(x = sort(x))
new2
## x
## 1 20
## 2 21
## 3 22
## 4 24
## 5 25
## 6 25
## 7 26
## 8 27
## 9 28
## 10 30
## 11 31
## 12 33
## 13 34
## 14 37
## 15 38
## 16 40
## 17 41
## 18 42
## 19 43
## 20 44
pred.CI = predict(fit, newdata=new2, interval = "confidence",se.fit=TRUE)
pred.CI
## $fit
## fit lwr upr
## 1 17972.93 12078.73 23867.12
## 2 18526.91 12981.47 24072.36
## 3 19080.90 13872.89 24288.92
## 4 20188.88 15611.81 24765.95
## 5 20742.87 16452.86 25032.89
## 6 20742.87 16452.86 25032.89
## 7 21296.86 17269.46 25324.26
## 8 21850.85 18056.56 25645.14
## 9 22404.84 18808.39 26001.28
## 10 23512.82 20182.19 26843.45
## 11 24066.81 20793.58 27340.03
## 12 25174.78 21852.30 28497.27
## 13 25728.77 22301.97 29155.57
## 14 27390.74 23388.07 31393.41
## 15 27944.73 23682.15 32207.31
## 16 29052.71 24199.97 33905.45
## 17 29606.70 24431.71 34781.68
## 18 30160.69 24649.53 35671.84
## 19 30714.68 24855.81 36573.54
## 20 31268.67 25052.50 37484.83
##
## $se.fit
## 1 2 3 4 5 6 7 8
## 2805.526 2639.529 2478.920 2178.602 2041.968 2041.968 1916.966 1806.012
## 9 10 11 12 13 14 15 16
## 1711.841 1585.317 1557.994 1581.443 1631.094 1905.195 2028.907 2309.816
## 17 18 19 20
## 2463.197 2623.209 2788.711 2958.782
##
## $df
## [1] 18
##
## $residual.scale
## [1] 6949.78
pred.PI = predict(fit, newdata=new2, interval = "prediction",se.fit=TRUE)
pred.PI
## $fit
## fit lwr upr
## 1 17972.93 2227.160 33718.69
## 2 18526.91 2908.349 34145.48
## 3 19080.90 3578.933 34582.87
## 4 20188.88 4887.339 35490.43
## 5 20742.87 5524.728 35961.01
## 6 20742.87 5524.728 35961.01
## 7 21296.86 6150.654 36443.07
## 8 21850.85 6764.953 36936.75
## 9 22404.84 7367.484 37442.19
## 10 23512.82 8536.812 38488.82
## 11 24066.81 9103.463 39030.15
## 12 25174.78 10200.588 40148.98
## 13 25728.77 10731.087 40726.46
## 14 27390.74 12251.092 42530.39
## 15 27944.73 12734.300 43155.16
## 16 29052.71 13666.457 44438.96
## 17 29606.70 14115.794 45097.60
## 18 30160.69 14554.262 45767.11
## 19 30714.68 14982.101 46447.25
## 20 31268.67 15399.563 47137.77
##
## $se.fit
## 1 2 3 4 5 6 7 8
## 2805.526 2639.529 2478.920 2178.602 2041.968 2041.968 1916.966 1806.012
## 9 10 11 12 13 14 15 16
## 1711.841 1585.317 1557.994 1581.443 1631.094 1905.195 2028.907 2309.816
## 17 18 19 20
## 2463.197 2623.209 2788.711 2958.782
##
## $df
## [1] 18
##
## $residual.scale
## [1] 6949.78
# create a plot
plot(x,y,xlab="Age",ylab="Income",main="Linear fitted line with confidence and prediction interval",
ylim=c(5000,55000))
lines(x,yhat,type="o",pch=20)
# 95% confidence interval
points(new2$x,pred.CI$fit[,"lwr"],col="red",type="o",pch=20)
points(new2$x,pred.CI$fit[,"upr"],col="red",type="o",pch=20)
# 95% prediction interval
points(new2$x,pred.PI$fit[,"lwr"],col="blue",type="o",pch=20)
points(new2$x,pred.PI$fit[,"upr"],col="blue",type="o",pch=20)
A teacher from Statistics class wants to examine the relationship between final and midterm score of her students.
Data file: MidtermandFinal.csv
1.Find sample correlation between midterm and final scores and interpret its value.
2.Test the population correlation whether midterm and final scores are correlated at significance level 0.05 (Perform the 4-step process (state hypotheses, give a test statistic and P-value, and state your conclusion).
3.Find the 95% confidence interval of the population correlation.
4.Make a scatterplot between midterm and final scores. From the scatterplot, describe the form, direction, and strength of the relationship. Is the relationship approximately linear?
5.Obtain the least-squares regression line for predicting final score from midterm score.
6.Find the value of MSres
7.From the regression line, test whether midterm score can be used to described final score at significance level 0.05. (Perform the 4-step process (state hypotheses, give a test statistic and P-value, and state your conclusion)
8.Find the 95% confidence interval of regression parameters
9.Find \(r^2\) and interpret its value for these data
10.Find 95% confidence and prediction interval of the final score for the midterm score of 75.
11.Plot linear fitted line and their 95% confidence interval and prediction interval for all values of x.
Objectives: Students are able to use R language to
analyse data using multiple linear regression:
1. perform descriptive statistics for multiple linear regression
2. select independent variables into the model
3. perform multiple linear regression analysis and make an inference on
regression parameters
4. interpret the results
A farm owner wants to find a way to reduce the cost of farming. He discovered that one of the major costs was the amount of water used and the monthly water bill. Therefore, he then collected data on the amount of water used and other variables that may be related to water consumption for 12 months.
Farming.csvsetwd("C:\\Users\\ASUS\\Downloads")
dt = read.csv("Farming.csv",header=TRUE)
str(dt)
## 'data.frame': 12 obs. of 7 variables:
## $ Month: int 1 2 3 4 5 6 7 8 9 10 ...
## $ x1 : num 58.8 65.2 70.9 77.4 79.3 81 71.9 63.9 54.5 39.5 ...
## $ x2 : int 7107 6373 6796 9208 14792 14564 11964 13526 12656 14119 ...
## $ x3 : int 21 22 22 20 25 23 20 23 20 20 ...
## $ x4 : int 129 141 153 166 193 189 175 186 190 187 ...
## $ x5 : int 52 68 29 23 40 14 96 94 54 37 ...
## $ y : int 3067 2828 2891 2994 3082 3898 3502 3060 3211 3286 ...
cor(dt[,2:7]) #get the r for each pair of variables
## x1 x2 x3 x4 x5 y
## x1 1.00000000 -0.3045404 0.524480986 -0.266266512 0.01497596 0.01204283
## x2 -0.30454037 1.0000000 0.129394604 0.931227310 -0.11338927 0.64468132
## x3 0.52448099 0.1293946 1.000000000 0.004058349 0.04524099 0.03874040
## x4 -0.26626651 0.9312273 0.004058349 1.000000000 -0.18548584 0.48884617
## x5 0.01497596 -0.1133893 0.045240990 -0.185485838 1.00000000 -0.13955958
## y 0.01204283 0.6446813 0.038740404 0.488846172 -0.13955958 1.00000000
pairs(dt[,2:7])
fit <- lm(y~x1+x2+x3+x4+x5,data=dt)
summary(fit)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -254.68 -76.26 15.92 82.48 283.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5687.53021 1446.84710 3.931 0.0077 **
## x1 12.78417 6.06857 2.107 0.0797 .
## x2 0.19539 0.05862 3.333 0.0157 *
## x3 -99.22265 51.43749 -1.929 0.1020
## x4 -19.31076 8.36183 -2.309 0.0603 .
## x5 -1.62894 2.44713 -0.666 0.5304
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 210.9 on 6 degrees of freedom
## Multiple R-squared: 0.7423, Adjusted R-squared: 0.5276
## F-statistic: 3.457 on 5 and 6 DF, p-value: 0.08149
anova(fit)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 150 150 0.0034 0.95555
## x2 1 479735 479735 10.7876 0.01673 *
## x3 1 49648 49648 1.1164 0.33137
## x4 1 219350 219350 4.9324 0.06811 .
## x5 1 19705 19705 0.4431 0.53038
## Residuals 6 266825 44471
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Forward selection
null=lm(y~1,data=dt)
full=lm(y~x1+x2+x3+x4+x5,data=dt)
fw.fit = step(null,scope=list(lower=null,upper=full),direction="forward")
## Start: AIC=138.38
## y ~ 1
##
## Df Sum of Sq RSS AIC
## + x2 1 430332 605080 133.94
## + x4 1 247433 787979 137.11
## <none> 1035412 138.38
## + x5 1 20167 1015245 140.15
## + x3 1 1554 1033858 140.37
## + x1 1 150 1035262 140.38
##
## Step: AIC=133.94
## y ~ x2
##
## Df Sum of Sq RSS AIC
## + x4 1 96918 508162 133.84
## <none> 605080 133.94
## + x1 1 49553 555527 134.91
## + x5 1 4633 600447 135.85
## + x3 1 2102 602978 135.90
##
## Step: AIC=133.84
## y ~ x2 + x4
##
## Df Sum of Sq RSS AIC
## <none> 508162 133.84
## + x1 1 56856 451306 134.42
## + x3 1 23836 484326 135.27
## + x5 1 19660 488503 135.37
fw.fit
##
## Call:
## lm(formula = y ~ x2 + x4, data = dt)
##
## Coefficients:
## (Intercept) x2 x4
## 3661.9811 0.1225 -10.8491
# Backward elimination
be.fit = step(full,direction="backward")
## Start: AIC=132.11
## y ~ x1 + x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x5 1 19705 286530 130.97
## <none> 266825 132.11
## - x3 1 165477 432302 135.90
## - x1 1 197354 464179 136.76
## - x4 1 237176 504001 137.75
## - x2 1 494115 760940 142.69
##
## Step: AIC=130.97
## y ~ x1 + x2 + x3 + x4
##
## Df Sum of Sq RSS AIC
## <none> 286530 130.97
## - x3 1 164777 451306 134.42
## - x1 1 197797 484326 135.27
## - x4 1 219350 505879 135.79
## - x2 1 476101 762630 140.72
be.fit
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = dt)
##
## Coefficients:
## (Intercept) x1 x2 x3 x4
## 5473.4484 12.7984 0.1899 -99.0107 -18.1916
# Stepwise regression
sw.fit = step(full, direction="both")
## Start: AIC=132.11
## y ~ x1 + x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x5 1 19705 286530 130.97
## <none> 266825 132.11
## - x3 1 165477 432302 135.90
## - x1 1 197354 464179 136.76
## - x4 1 237176 504001 137.75
## - x2 1 494115 760940 142.69
##
## Step: AIC=130.97
## y ~ x1 + x2 + x3 + x4
##
## Df Sum of Sq RSS AIC
## <none> 286530 130.97
## + x5 1 19705 266825 132.11
## - x3 1 164777 451306 134.42
## - x1 1 197797 484326 135.27
## - x4 1 219350 505879 135.79
## - x2 1 476101 762630 140.72
sw.fit
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = dt)
##
## Coefficients:
## (Intercept) x1 x2 x3 x4
## 5473.4484 12.7984 0.1899 -99.0107 -18.1916
Objectives: Students are able to use R language to
analyse data using multiple linear regression:
1. perform descriptive statistics
2. transform qualitative independent variable into dummy variables
3. select independent variables into the model
4. perform multiple linear regression analysis and make an inference on
regression parameters
5. perform model diagnostics (check residuals assumptions)
6. interpret the results
HousePrices.csvNote that your working directory (the place where the data file is at) will be different from mine.
setwd("C:\\Users\\ASUS\\Downloads")
data = read.csv("HousePrices.csv",header=TRUE)
data
## Store NumberOfHousehold Location SalesPrice
## 1 1 161 Street 157.27
## 2 2 99 Street 93.28
## 3 3 135 Street 136.81
## 4 4 120 Street 123.79
## 5 5 164 Street 153.51
## 6 6 221 Mall 241.74
## 7 7 179 Mall 204.54
## 8 8 204 Mall 206.71
## 9 9 214 Mall 229.78
## 10 10 101 Mall 135.22
## 11 11 231 Downtown 224.71
## 12 12 206 Downtown 195.29
## 13 13 248 Downtown 242.16
## 14 14 107 Downtown 115.21
## 15 15 205 Downtown 197.82
# rename the variables for convenience
y = data$SalesPrice
x1 = data$NumberOfHousehold #quantitative independent variable
x2 = data$Location #qualitative independent variable with k=3 groups
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 93.28 136.01 195.29 177.19 215.71 242.16
summary(x1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 99.0 127.5 179.0 173.0 210.0 248.0
plot(x1,y,xlab="Number Of Household",ylab="Sale price")
cor(x1,y) #Compute the r between two quantitative variables
## [1] 0.9610064
boxplot(y~x2,xlab="Location",ylab="Sale price")
NOTE: It is easier to work with graphs when using ggplot
data_df = data.frame(data) # set data as dataframe type
library(ggplot2) # you might need to install packgage "ggplot2" first!
## Warning: package 'ggplot2' was built under R version 4.3.3
ggplot(data_df,aes(x=NumberOfHousehold,y=SalesPrice))+
geom_point(aes(color=Location))
ggplot(data_df,aes(y=SalesPrice,x=Location))+
geom_boxplot(aes(color=Location))
#create x2.dummy
x2.dummy= ifelse(data$Location=="Street",1,0)
x2.dummy
## [1] 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
#create x3.dummy
x3.dummy= ifelse(data$Location=="Mall",1,0)
x3.dummy
## [1] 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0
# Forward selection
null=lm(y~1)
full=lm(y~x1+x2.dummy+x3.dummy)
fw.fit = step(null, scope=list(lower=null,upper=full),direction="forward")
## Start: AIC=117.83
## y ~ 1
##
## Df Sum of Sq RSS AIC
## + x1 1 31278.1 2590 81.269
## + x2.dummy 1 14690.3 19178 111.302
## + x3.dummy 1 5230.6 28637 117.316
## <none> 33868 117.833
##
## Step: AIC=81.27
## y ~ x1
##
## Df Sum of Sq RSS AIC
## + x3.dummy 1 2038.47 551.29 60.063
## + x2.dummy 1 931.21 1658.56 76.585
## <none> 2589.77 81.269
##
## Step: AIC=60.06
## y ~ x1 + x3.dummy
##
## Df Sum of Sq RSS AIC
## + x2.dummy 1 84.366 466.92 59.572
## <none> 551.29 60.063
##
## Step: AIC=59.57
## y ~ x1 + x3.dummy + x2.dummy
fw.fit
##
## Call:
## lm(formula = y ~ x1 + x3.dummy + x2.dummy)
##
## Coefficients:
## (Intercept) x1 x3.dummy x2.dummy
## 21.958 0.868 22.101 -6.901
# Backward elimination
be.fit = step(full,direction="backward")
## Start: AIC=59.57
## y ~ x1 + x2.dummy + x3.dummy
##
## Df Sum of Sq RSS AIC
## <none> 466.9 59.572
## - x2.dummy 1 84.4 551.3 60.063
## - x3.dummy 1 1191.6 1658.6 76.585
## - x1 1 18527.4 18994.3 113.158
be.fit
##
## Call:
## lm(formula = y ~ x1 + x2.dummy + x3.dummy)
##
## Coefficients:
## (Intercept) x1 x2.dummy x3.dummy
## 21.958 0.868 -6.901 22.101
# Stepwise regression
sw.fit = step(full, direction="both")
## Start: AIC=59.57
## y ~ x1 + x2.dummy + x3.dummy
##
## Df Sum of Sq RSS AIC
## <none> 466.9 59.572
## - x2.dummy 1 84.4 551.3 60.063
## - x3.dummy 1 1191.6 1658.6 76.585
## - x1 1 18527.4 18994.3 113.158
sw.fit
##
## Call:
## lm(formula = y ~ x1 + x2.dummy + x3.dummy)
##
## Coefficients:
## (Intercept) x1 x2.dummy x3.dummy
## 21.958 0.868 -6.901 22.101
summary(sw.fit)
##
## Call:
## lm(formula = y ~ x1 + x2.dummy + x3.dummy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.422 -2.989 2.243 4.572 5.852
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.95824 8.78193 2.500 0.029486 *
## x1 0.86800 0.04155 20.892 3.34e-10 ***
## x2.dummy -6.90102 4.89503 -1.410 0.186240
## x3.dummy 22.10084 4.17123 5.298 0.000253 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.515 on 11 degrees of freedom
## Multiple R-squared: 0.9862, Adjusted R-squared: 0.9825
## F-statistic: 262.3 on 3 and 11 DF, p-value: 1.641e-10
anova(sw.fit)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 31278.1 31278.1 736.863 1.976e-11 ***
## x2.dummy 1 931.2 931.2 21.938 0.0006675 ***
## x3.dummy 1 1191.6 1191.6 28.073 0.0002530 ***
## Residuals 11 466.9 42.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
confint.lm(sw.fit)
## 2.5 % 97.5 %
## (Intercept) 2.6293467 41.2871246
## x1 0.7765583 0.9594473
## x2.dummy -17.6749031 3.8728631
## x3.dummy 12.9200368 31.2816515
new = data.frame(x1=x1,x2.dummy=x2.dummy,x3.dummy=x3.dummy)
yhat = predict(sw.fit,newdata=new) #compute fitted y
yhat
## 1 2 3 4 5 6 7 8
## 154.8057 100.9895 132.2376 119.2176 157.4097 235.8877 199.4316 221.1317
## 9 10 11 12 13 14 15
## 229.8117 131.7274 222.4669 200.7668 237.2229 114.8345 199.8988
plot(x1,y,pch=1,xlab="Number Of Household",ylab="Sale price")
points(x1,yhat,type="p",pch=20)
legend("topleft",c("observed y","predicted y"),pch=c(1,20))
par(mfrow=c(2,2)) #set plot layout as 2 row 2 column
plot(fw.fit)
# Normality check by shapito wilk test
par(mfrow=c(1,2)) #set plot layout as 1 row 2 column
res = resid(fw.fit)
hist(res,main="Histogram of Residuals")
qqnorm(res)
qqline(res)
shapiro.test(res)
##
## Shapiro-Wilk normality test
##
## data: res
## W = 0.87123, p-value = 0.03518
#constant variance check by Breusch-Pagan test (bp test)
#install.packages("lmtest") you need to install the package first!
library("lmtest")
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(fw.fit)
##
## studentized Breusch-Pagan test
##
## data: fw.fit
## BP = 2.2962, df = 3, p-value = 0.5132
# durbin watson test for autocorrelation test
dwtest(fw.fit)
##
## Durbin-Watson test
##
## data: fw.fit
## DW = 2.611, p-value = 0.7911
## alternative hypothesis: true autocorrelation is greater than 0
Data file: ScoreAndEQ.csv
Objectives: Students are able to
1)perform descriptive statistics
2)apply appropriate non-parametric statistics tests to answer reseach
questions of interest.
Import data
setwd("C:\\Users\\ASUS\\Downloads")
dt <- read.csv("AStore_data.csv",header=TRUE)
head(dt)#To view first 6 rows of the data set
## Customer MethodOfPayment ItemsPerchased DiscountAmount Sales Gender
## 1 1 Cash 1 0 39.5 male
## 2 2 Cash 1 25 102.4 female
## 3 3 Cash 1 0 22.5 male
## 4 4 Cash 5 12 100.4 male
## 5 5 Cash 2 0 54.0 male
## 6 6 Cash 1 0 39.5 female
## MaritalStatus Age
## 1 single 32
## 2 married 36
## 3 divorced 32
## 4 widowed 28
## 5 single 34
## 6 single 44
str(dt) #To view type of variable in the data
## 'data.frame': 30 obs. of 8 variables:
## $ Customer : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MethodOfPayment: chr "Cash" "Cash" "Cash" "Cash" ...
## $ ItemsPerchased : int 1 1 1 5 2 1 9 3 4 4 ...
## $ DiscountAmount : int 0 25 0 12 0 0 11 34 18 4 ...
## $ Sales : num 39.5 102.4 22.5 100.4 54 ...
## $ Gender : chr "male" "female" "male" "male" ...
## $ MaritalStatus : chr "single" "married" "divorced" "widowed" ...
## $ Age : int 32 36 32 28 34 44 30 52 30 34 ...
Method of payment, Gender, Marital Status are qualitative variables. Then these variables need to be changed to be as.factor
dt$MethodOfPayment= as.factor(dt$MethodOfPayment)
dt$Gender = as.factor(dt$Gender)
dt$MaritalStatus = as.factor(dt$MaritalStatus)
summary(dt$Age) #calculate summary statistics of Age
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.00 31.25 38.00 40.27 49.75 68.00
par(mfrow=c(1,2))#create plots layout with 1 row and 2 columns
boxplot(dt$Age) #create boxplot of age
hist(dt$Age) #create histogram of age
#perform Wilcoxon test for one sample
wilcox.test(x=dt$Age, mu=45, alternative = "two.sided")
## Warning in wilcox.test.default(x = dt$Age, mu = 45, alternative = "two.sided"):
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = dt$Age, mu = 45, alternative = "two.sided"):
## cannot compute exact p-value with zeroes
##
## Wilcoxon signed rank test with continuity correction
##
## data: dt$Age
## V = 116.5, p-value = 0.04996
## alternative hypothesis: true location is not equal to 45
summary(dt$ItemsPerchased)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.250 8.000 7.967 10.750 20.000
par(mfrow=c(1,2)) #to set layout of plots to be 1 row 2 columns
boxplot(dt$ItemsPerchased,ylab="Items perchased")
hist(dt$ItemsPerchased,xlab="Items perchased")
#perform .........for testing mean of one sample
wilcox.test(x=dt$ItemsPerchased,mu=7,alternative = "less")
## Warning in wilcox.test.default(x = dt$ItemsPerchased, mu = 7, alternative =
## "less"): cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = dt$ItemsPerchased, mu = 7, alternative =
## "less"): cannot compute exact p-value with zeroes
##
## Wilcoxon signed rank test with continuity correction
##
## data: dt$ItemsPerchased
## V = 225, p-value = 0.6964
## alternative hypothesis: true location is less than 7
setwd("C:\\Users\\ASUS\\Downloads")
dt2 <- read.csv("Music_data.csv",header = TRUE)
head(dt2)#To view first 6 rows of the data set
## Runner Type Scale
## 1 1 None 8
## 2 2 None 7
## 3 3 None 6
## 4 4 None 8
## 5 5 None 5
## 6 6 None 9
str(dt2) #To view type of variable in the data
## 'data.frame': 36 obs. of 3 variables:
## $ Runner: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Type : chr "None" "None" "None" "None" ...
## $ Scale : int 8 7 6 8 5 9 7 8 8 7 ...
#Type is qualitative variable –> set as/factor
dt2$Type = as.factor(dt2$Type)
Explore data
#find summary statistics of Scale for each Type
tapply(dt2$Scale, dt2$Type, summary)
## $Classical
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.000 6.750 7.500 7.417 8.000 9.000
##
## $Dance
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 6.0 6.5 6.5 7.0 8.0
##
## $None
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.000 7.000 7.500 7.417 8.000 9.000
#create boxplot of Scale by Type
par(mfrow=c(1,1))
boxplot(dt2$Scale ~ dt2$Type,xlab="Type of music",ylab="Scale")
A) Is there a difference between None to Classical ?
wilcox.test(x=dt2$Scale[dt2$Type=="None"], y=dt2$Scale[dt2$Type=="Classical"],
paired = TRUE, alternative = "two.sided")
## Warning in wilcox.test.default(x = dt2$Scale[dt2$Type == "None"], y =
## dt2$Scale[dt2$Type == : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = dt2$Scale[dt2$Type == "None"], y =
## dt2$Scale[dt2$Type == : cannot compute exact p-value with zeroes
##
## Wilcoxon signed rank test with continuity correction
##
## data: dt2$Scale[dt2$Type == "None"] and dt2$Scale[dt2$Type == "Classical"]
## V = 23, p-value = 1
## alternative hypothesis: true location shift is not equal to 0
#B) Is the Classical higher than Dance?
wilcox.test(x=dt2$Scale[dt2$Type=="Classical"], y=dt2$Scale[dt2$Type=="Dance"],
paired = TRUE, alternative = "greater")
## Warning in wilcox.test.default(x = dt2$Scale[dt2$Type == "Classical"], y =
## dt2$Scale[dt2$Type == : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = dt2$Scale[dt2$Type == "Classical"], y =
## dt2$Scale[dt2$Type == : cannot compute exact p-value with zeroes
##
## Wilcoxon signed rank test with continuity correction
##
## data: dt2$Scale[dt2$Type == "Classical"] and dt2$Scale[dt2$Type == "Dance"]
## V = 24.5, p-value = 0.04231
## alternative hypothesis: true location shift is greater than 0
Objectives: Students are able to
1)perform descriptive statistics
2)apply appropriate non-parametric statistics tests to answer reseach
questions of interest.
Import data
setwd("C:\\Users\\ASUS\\Downloads")
dt <- read.csv("AStore_data.csv",header=TRUE)
#look at data structure
str(dt)
## 'data.frame': 30 obs. of 8 variables:
## $ Customer : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MethodOfPayment: chr "Cash" "Cash" "Cash" "Cash" ...
## $ ItemsPerchased : int 1 1 1 5 2 1 9 3 4 4 ...
## $ DiscountAmount : int 0 25 0 12 0 0 11 34 18 4 ...
## $ Sales : num 39.5 102.4 22.5 100.4 54 ...
## $ Gender : chr "male" "female" "male" "male" ...
## $ MaritalStatus : chr "single" "married" "divorced" "widowed" ...
## $ Age : int 32 36 32 28 34 44 30 52 30 34 ...
Change qualitative variables to as.factor
dt$MethodOfPayment <- as.factor(dt$MethodOfPayment)
dt$Gender <- as.factor(dt$Gender)
dt$MaritalStatus <- as.factor(dt$MaritalStatus)
check data structure again
str(dt)
## 'data.frame': 30 obs. of 8 variables:
## $ Customer : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MethodOfPayment: Factor w/ 3 levels "Cash","Credit card",..: 1 1 1 1 1 1 1 1 1 3 ...
## $ ItemsPerchased : int 1 1 1 5 2 1 9 3 4 4 ...
## $ DiscountAmount : int 0 25 0 12 0 0 11 34 18 4 ...
## $ Sales : num 39.5 102.4 22.5 100.4 54 ...
## $ Gender : Factor w/ 2 levels "female","male": 2 1 2 2 2 1 2 1 1 1 ...
## $ MaritalStatus : Factor w/ 4 levels "divorced","married",..: 3 2 1 4 3 3 3 2 2 3 ...
## $ Age : int 32 36 32 28 34 44 30 52 30 34 ...
tapply(dt$Sales,dt$Gender,summary)
## $female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.0 78.5 150.0 171.0 276.2 357.0
##
## $male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.0 31.0 57.0 84.4 121.2 253.0
boxplot(dt$Sales~dt$Gender,xlab="Gender",ylab="Sales")
#perform Mann-Whitney
wilcox.test(x=dt$Sales[dt$Gender=="female"],
y=dt$Sales[dt$Gender=="male"],
paired=FALSE,
alternative = "two.sided")
## Warning in wilcox.test.default(x = dt$Sales[dt$Gender == "female"], y =
## dt$Sales[dt$Gender == : cannot compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: dt$Sales[dt$Gender == "female"] and dt$Sales[dt$Gender == "male"]
## W = 158.5, p-value = 0.02128
## alternative hypothesis: true location shift is not equal to 0
tapply(dt$DiscountAmount,dt$MethodOfPayment,summary)
## $Cash
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 11.00 11.11 18.00 34.00
##
## $`Credit card`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 15.75 34.00 29.70 41.50 49.00
##
## $`Debit card`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 6 27 25 41 48
boxplot(dt$DiscountAmount~dt$MethodOfPayment)
#perform Kruskal wallis
kruskal.test(dt$DiscountAmount~dt$MethodOfPayment)
##
## Kruskal-Wallis rank sum test
##
## data: dt$DiscountAmount by dt$MethodOfPayment
## Kruskal-Wallis chi-squared = 7.1831, df = 2, p-value = 0.02756
plot(dt$ItemsPerchased,dt$DiscountAmount,
xlab="Items perchased",ylab="Discount amount")
#perform Spearman rank test
cor.test(~dt$ItemsPerchased+dt$DiscountAmount,
method="spearman",alternative="two.sided")
## Warning in cor.test.default(x = mf[[1L]], y = mf[[2L]], ...): Cannot compute
## exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dt$ItemsPerchased and dt$DiscountAmount
## S = 2674.5, p-value = 0.0264
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4050143
plot(dt$Age,dt$Sales)
cor.test(~dt$Age+dt$Sales,method="spearman",alternative="greater")
## Warning in cor.test.default(x = mf[[1L]], y = mf[[2L]], ...): Cannot compute
## exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dt$Age and dt$Sales
## S = 4126, p-value = 0.3331
## alternative hypothesis: true rho is greater than 0
## sample estimates:
## rho
## 0.08208958
boxplot(dt$Sales~dt$MaritalStatus)
#perform Kruskal -Wallis
kruskal.test(dt$Sales~dt$MaritalStatus)
##
## Kruskal-Wallis rank sum test
##
## data: dt$Sales by dt$MaritalStatus
## Kruskal-Wallis chi-squared = 2.4721, df = 3, p-value = 0.4804
table(dt$Gender,dt$MethodOfPayment)
##
## Cash Credit card Debit card
## female 4 7 8
## male 5 3 3
chisq.test(dt$Gender,dt$MethodOfPayment)
## Warning in chisq.test(dt$Gender, dt$MethodOfPayment): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: dt$Gender and dt$MethodOfPayment
## X-squared = 1.9922, df = 2, p-value = 0.3693
library(readr)
setwd("C:\\Users\\ASUS\\Downloads")
dt2 <- read.csv("Music_data.csv",header = TRUE)
Type is qualitative variable–>as.factor
dt2$Type <- as.factor(dt2$Type)
str(dt2)
## 'data.frame': 36 obs. of 3 variables:
## $ Runner: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Type : Factor w/ 3 levels "Classical","Dance",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Scale : int 8 7 6 8 5 9 7 8 8 7 ...
Can we conclude that scales felt while running are affected by the types of music?
tapply(dt2$Scale, dt2$Type,summary)
## $Classical
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.000 6.750 7.500 7.417 8.000 9.000
##
## $Dance
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 6.0 6.5 6.5 7.0 8.0
##
## $None
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.000 7.000 7.500 7.417 8.000 9.000
boxplot(dt2$Scale~dt2$Type,xlab="Type",ylab="Scale")
friedman.test(y=dt2$Scale,group=dt2$Type,blocks = dt2$Runner)
##
## Friedman rank sum test
##
## data: dt2$Scale, dt2$Type and dt2$Runner
## Friedman chi-squared = 7.6, df = 2, p-value = 0.02237