# Load the two student-performance data sets (semicolon-separated files with a
# header row): d1 is read from student-mat.csv, d2 from student-por.csv.
# NOTE(review): data_dir is a machine-specific location; adjust it when the
# script is run on another computer.
data_dir <- "C:/Users/david/Downloads/student+performance/student"
d1 <- read.table(file.path(data_dir, "student-mat.csv"), sep = ";", header = TRUE)
d2 <- read.table(file.path(data_dir, "student-por.csv"), sep = ";", header = TRUE)

# View() opens a data viewer and is only useful in an interactive session,
# so skip it when the script is run non-interactively.
if (interactive()) {
  View(d1)
  View(d2)
}
library(dplyr)
##
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## 载入需要的程辑包:carData
##
## 载入程辑包:'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## 载入程辑包:'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(MASS)
##
## 载入程辑包:'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(leaps)
library(ggplot2)
##
## 载入程辑包:'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(caret)
## 载入需要的程辑包:lattice
#Mathematic data
# Drop columns 31 and 32 (G1 and G2) so the full model below is fitted without
# them; in d11 the response G3 is therefore the last column.
d11 <- d1[, -c(31, 32)]

# Full linear model for G3 on the listed predictors.
# NOTE(review): famsup is a column of the data and is used later in sigm3, but
# it is absent from this formula -- confirm whether the omission is intended.
m1 <- lm(
  G3 ~ school + sex + age + address + famsize + Pstatus + Medu + Fedu +
    Mjob + Fjob + reason + guardian + traveltime + studytime + failures +
    schoolsup + paid + activities + nursery + higher + internet + romantic +
    famrel + freetime + goout + Dalc + Walc + health + absences,
  data = d11
)

# Flag observations whose Cook's distance exceeds three times the mean
# distance. which() drops any NA comparison results so they cannot turn into
# NA row names below.
cooksD <- cooks.distance(m1)
influential <- cooksD[which(cooksD > 3 * mean(cooksD, na.rm = TRUE))]
names_of_influential <- names(influential)
df <- d1[names_of_influential, ]

# Remove the flagged rows by row name. Excluding by row name (rather than
# anti-joining on every column) removes exactly the flagged observations, even
# if another student happens to have identical values in all columns.
# NOTE(review): d1_clean is built from d1, so it still contains G1 and G2 and
# G3 is column 33 here (not column 31 as in d11) -- confirm downstream code
# expects that.
d1_clean <- d1[!(rownames(d1) %in% names_of_influential), ]
rownames(d1_clean) <- NULL # renumber the remaining rows 1..n
# Refit the full model (same predictors as m1) on the data with the
# influential observations removed.
m1c <- lm(
  formula = G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
    Fedu + Mjob + Fjob + reason + guardian + traveltime + studytime +
    failures + schoolsup + paid + activities + nursery + higher + internet +
    romantic + famrel + freetime + goout + Dalc + Walc + health + absences,
  data = d1_clean
)

# Diagnostic plots for the refitted model
plot(m1c)

# Distribution of G3 in the cleaned mathematics data
hist(x = d1_clean$G3, xlab = "G3", main = "G3 Grades for Mathematics")

#Stepwise selection
# Start from the full cleaned model m1c and let step() search by AIC; the
# trace of each elimination step is printed to the console.
selectedMod <- step(object = m1c)
## Start: AIC=828.97
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
## Fedu + Mjob + Fjob + reason + guardian + traveltime + studytime +
## failures + schoolsup + paid + activities + nursery + higher +
## internet + romantic + famrel + freetime + goout + Dalc +
## Walc + health + absences
##
## Df Sum of Sq RSS AIC
## - reason 3 1.82 2936.7 823.19
## - guardian 2 15.66 2950.5 826.87
## - Walc 1 0.03 2934.9 826.98
## - Pstatus 1 0.26 2935.1 827.01
## - freetime 1 0.42 2935.3 827.03
## - Fedu 1 0.62 2935.5 827.05
## - nursery 1 0.74 2935.6 827.06
## - absences 1 1.78 2936.6 827.19
## - school 1 1.79 2936.6 827.19
## - activities 1 2.42 2937.3 827.27
## - Medu 1 3.34 2938.2 827.38
## - traveltime 1 3.70 2938.5 827.42
## - famrel 1 4.18 2939.0 827.48
## - internet 1 5.26 2940.1 827.61
## - Dalc 1 5.77 2940.6 827.67
## - address 1 10.26 2945.1 828.22
## - romantic 1 13.24 2948.1 828.58
## - famsize 1 15.65 2950.5 828.87
## - paid 1 15.75 2950.6 828.88
## <none> 2934.8 828.97
## - higher 1 31.92 2966.8 830.82
## - age 1 35.20 2970.0 831.22
## - Fjob 4 85.75 3020.6 831.23
## - health 1 41.70 2976.5 832.00
## - sex 1 61.12 2996.0 834.31
## - studytime 1 73.98 3008.8 835.84
## - goout 1 75.68 3010.5 836.04
## - Mjob 4 176.98 3111.8 841.82
## - schoolsup 1 140.07 3074.9 843.57
## - failures 1 524.63 3459.5 885.52
##
## Step: AIC=823.19
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
## Fedu + Mjob + Fjob + guardian + traveltime + studytime +
## failures + schoolsup + paid + activities + nursery + higher +
## internet + romantic + famrel + freetime + goout + Dalc +
## Walc + health + absences
##
## Df Sum of Sq RSS AIC
## - guardian 2 16.29 2952.9 821.16
## - Walc 1 0.03 2936.7 821.20
## - Pstatus 1 0.24 2936.9 821.22
## - Fedu 1 0.46 2937.1 821.25
## - freetime 1 0.48 2937.1 821.25
## - nursery 1 0.78 2937.4 821.29
## - absences 1 1.74 2938.4 821.40
## - school 1 2.08 2938.7 821.45
## - activities 1 2.09 2938.7 821.45
## - Medu 1 3.45 2940.1 821.61
## - traveltime 1 3.85 2940.5 821.66
## - famrel 1 4.34 2941.0 821.72
## - internet 1 5.19 2941.8 821.82
## - Dalc 1 6.69 2943.3 822.00
## - address 1 10.05 2946.7 822.41
## - romantic 1 12.45 2949.1 822.70
## - famsize 1 15.37 2952.0 823.05
## - paid 1 15.40 2952.1 823.06
## <none> 2936.7 823.19
## - higher 1 30.54 2967.2 824.88
## - age 1 36.11 2972.8 825.55
## - Fjob 4 90.11 3026.8 825.95
## - health 1 44.05 2980.7 826.49
## - sex 1 60.09 2996.7 828.40
## - studytime 1 74.47 3011.1 830.11
## - goout 1 77.80 3014.5 830.50
## - Mjob 4 188.51 3125.2 837.34
## - schoolsup 1 141.19 3077.8 837.91
## - failures 1 530.28 3466.9 880.29
##
## Step: AIC=821.16
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
## Fedu + Mjob + Fjob + traveltime + studytime + failures +
## schoolsup + paid + activities + nursery + higher + internet +
## romantic + famrel + freetime + goout + Dalc + Walc + health +
## absences
##
## Df Sum of Sq RSS AIC
## - Walc 1 0.10 2953.0 819.17
## - Pstatus 1 0.45 2953.4 819.22
## - Fedu 1 1.29 2954.2 819.32
## - freetime 1 1.34 2954.3 819.32
## - nursery 1 1.52 2954.5 819.35
## - school 1 1.81 2954.7 819.38
## - activities 1 2.19 2955.1 819.43
## - traveltime 1 2.28 2955.2 819.44
## - Medu 1 2.40 2955.3 819.45
## - absences 1 3.43 2956.4 819.58
## - internet 1 5.10 2958.0 819.78
## - famrel 1 5.49 2958.4 819.82
## - Dalc 1 6.87 2959.8 819.99
## - romantic 1 11.94 2964.9 820.60
## - address 1 12.92 2965.9 820.72
## - famsize 1 13.80 2966.7 820.82
## - paid 1 14.46 2967.4 820.90
## <none> 2952.9 821.16
## - age 1 26.23 2979.2 822.31
## - higher 1 37.68 2990.6 823.68
## - Fjob 4 88.73 3041.7 823.70
## - health 1 46.83 2999.8 824.76
## - sex 1 61.73 3014.7 826.53
## - studytime 1 80.54 3033.5 828.74
## - goout 1 90.09 3043.0 829.86
## - Mjob 4 187.38 3140.3 835.07
## - schoolsup 1 139.17 3092.1 835.56
## - failures 1 521.00 3473.9 877.01
##
## Step: AIC=819.17
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
## Fedu + Mjob + Fjob + traveltime + studytime + failures +
## schoolsup + paid + activities + nursery + higher + internet +
## romantic + famrel + freetime + goout + Dalc + health + absences
##
## Df Sum of Sq RSS AIC
## - Pstatus 1 0.46 2953.5 817.23
## - Fedu 1 1.24 2954.3 817.32
## - freetime 1 1.38 2954.4 817.34
## - nursery 1 1.46 2954.5 817.35
## - school 1 1.85 2954.9 817.40
## - activities 1 2.17 2955.2 817.44
## - traveltime 1 2.27 2955.3 817.45
## - Medu 1 2.53 2955.6 817.48
## - absences 1 3.36 2956.4 817.58
## - internet 1 5.12 2958.2 817.79
## - famrel 1 5.91 2958.9 817.89
## - Dalc 1 8.66 2961.7 818.22
## - romantic 1 11.93 2965.0 818.61
## - address 1 13.03 2966.1 818.74
## - famsize 1 13.71 2966.8 818.82
## - paid 1 14.72 2967.8 818.94
## <none> 2953.0 819.17
## - age 1 26.34 2979.4 820.34
## - higher 1 37.72 2990.8 821.69
## - Fjob 4 92.14 3045.2 822.11
## - health 1 47.38 3000.4 822.84
## - sex 1 61.94 3015.0 824.56
## - studytime 1 82.21 3035.2 826.95
## - goout 1 106.71 3059.7 829.81
## - Mjob 4 187.59 3140.6 833.10
## - schoolsup 1 139.07 3092.1 833.56
## - failures 1 521.35 3474.4 875.05
##
## Step: AIC=817.23
## G3 ~ school + sex + age + address + famsize + Medu + Fedu + Mjob +
## Fjob + traveltime + studytime + failures + schoolsup + paid +
## activities + nursery + higher + internet + romantic + famrel +
## freetime + goout + Dalc + health + absences
##
## Df Sum of Sq RSS AIC
## - Fedu 1 1.31 2954.8 815.39
## - freetime 1 1.33 2954.8 815.39
## - nursery 1 1.40 2954.9 815.40
## - school 1 1.85 2955.4 815.45
## - traveltime 1 2.26 2955.8 815.50
## - activities 1 2.40 2955.9 815.52
## - Medu 1 2.73 2956.2 815.56
## - absences 1 3.77 2957.3 815.68
## - internet 1 4.90 2958.4 815.82
## - famrel 1 5.99 2959.5 815.95
## - Dalc 1 8.78 2962.3 816.29
## - romantic 1 11.77 2965.3 816.65
## - address 1 12.99 2966.5 816.79
## - paid 1 15.02 2968.5 817.04
## - famsize 1 15.07 2968.6 817.04
## <none> 2953.5 817.23
## - age 1 27.14 2980.6 818.49
## - higher 1 37.84 2991.3 819.76
## - Fjob 4 92.70 3046.2 820.23
## - health 1 48.01 3001.5 820.97
## - sex 1 61.73 3015.2 822.59
## - studytime 1 82.53 3036.0 825.04
## - goout 1 106.49 3060.0 827.84
## - Mjob 4 188.52 3142.0 831.26
## - schoolsup 1 138.99 3092.5 831.60
## - failures 1 521.00 3474.5 873.07
##
## Step: AIC=815.39
## G3 ~ school + sex + age + address + famsize + Medu + Mjob + Fjob +
## traveltime + studytime + failures + schoolsup + paid + activities +
## nursery + higher + internet + romantic + famrel + freetime +
## goout + Dalc + health + absences
##
## Df Sum of Sq RSS AIC
## - freetime 1 1.25 2956.1 813.54
## - nursery 1 1.31 2956.1 813.55
## - school 1 1.90 2956.7 813.62
## - activities 1 2.18 2957.0 813.65
## - traveltime 1 2.54 2957.3 813.69
## - absences 1 3.60 2958.4 813.82
## - internet 1 5.39 2960.2 814.04
## - famrel 1 6.00 2960.8 814.11
## - Medu 1 6.39 2961.2 814.16
## - Dalc 1 8.57 2963.4 814.42
## - romantic 1 11.64 2966.5 814.79
## - address 1 12.77 2967.6 814.92
## - famsize 1 14.76 2969.6 815.16
## - paid 1 14.89 2969.7 815.18
## <none> 2954.8 815.39
## - age 1 27.01 2981.8 816.63
## - higher 1 39.88 2994.7 818.16
## - health 1 47.22 3002.0 819.03
## - Fjob 4 106.85 3061.7 820.03
## - sex 1 61.64 3016.5 820.74
## - studytime 1 81.38 3036.2 823.06
## - goout 1 105.64 3060.4 825.89
## - Mjob 4 187.37 3142.2 829.28
## - schoolsup 1 138.22 3093.0 829.66
## - failures 1 536.22 3491.0 872.76
##
## Step: AIC=813.54
## G3 ~ school + sex + age + address + famsize + Medu + Mjob + Fjob +
## traveltime + studytime + failures + schoolsup + paid + activities +
## nursery + higher + internet + romantic + famrel + goout +
## Dalc + health + absences
##
## Df Sum of Sq RSS AIC
## - nursery 1 1.42 2957.5 811.71
## - activities 1 2.00 2958.1 811.78
## - school 1 2.12 2958.2 811.79
## - traveltime 1 2.70 2958.8 811.86
## - absences 1 3.51 2959.6 811.96
## - internet 1 5.27 2961.3 812.17
## - Medu 1 6.00 2962.1 812.26
## - famrel 1 7.19 2963.3 812.40
## - Dalc 1 9.52 2965.6 812.68
## - romantic 1 11.29 2967.4 812.90
## - address 1 12.97 2969.0 813.10
## - famsize 1 14.57 2970.6 813.29
## - paid 1 15.63 2971.7 813.42
## <none> 2956.1 813.54
## - age 1 28.66 2984.7 814.97
## - higher 1 40.14 2996.2 816.34
## - health 1 46.96 3003.0 817.15
## - Fjob 4 106.78 3062.9 818.17
## - sex 1 64.26 3020.3 819.20
## - studytime 1 80.50 3036.6 821.10
## - goout 1 107.01 3063.1 824.20
## - Mjob 4 186.40 3142.5 827.31
## - schoolsup 1 140.40 3096.5 828.06
## - failures 1 535.33 3491.4 870.79
##
## Step: AIC=811.71
## G3 ~ school + sex + age + address + famsize + Medu + Mjob + Fjob +
## traveltime + studytime + failures + schoolsup + paid + activities +
## higher + internet + romantic + famrel + goout + Dalc + health +
## absences
##
## Df Sum of Sq RSS AIC
## - activities 1 1.87 2959.4 809.94
## - school 1 2.40 2959.9 810.00
## - traveltime 1 2.93 2960.4 810.06
## - absences 1 3.49 2961.0 810.13
## - Medu 1 5.60 2963.1 810.38
## - internet 1 5.62 2963.1 810.39
## - famrel 1 7.00 2964.5 810.55
## - Dalc 1 10.37 2967.9 810.96
## - romantic 1 11.99 2969.5 811.15
## - address 1 12.80 2970.3 811.25
## - famsize 1 13.68 2971.2 811.35
## - paid 1 16.30 2973.8 811.67
## <none> 2957.5 811.71
## - age 1 28.52 2986.0 813.13
## - higher 1 39.72 2997.2 814.46
## - health 1 46.94 3004.4 815.32
## - Fjob 4 106.44 3063.9 816.30
## - sex 1 64.39 3021.9 817.38
## - studytime 1 79.78 3037.3 819.19
## - goout 1 109.85 3067.3 822.69
## - Mjob 4 185.69 3143.2 825.39
## - schoolsup 1 141.68 3099.2 826.37
## - failures 1 534.39 3491.9 868.84
##
## Step: AIC=809.94
## G3 ~ school + sex + age + address + famsize + Medu + Mjob + Fjob +
## traveltime + studytime + failures + schoolsup + paid + higher +
## internet + romantic + famrel + goout + Dalc + health + absences
##
## Df Sum of Sq RSS AIC
## - school 1 2.78 2962.1 808.27
## - traveltime 1 3.05 2962.4 808.30
## - absences 1 3.47 2962.8 808.35
## - Medu 1 5.20 2964.6 808.56
## - internet 1 5.53 2964.9 808.60
## - famrel 1 6.92 2966.3 808.77
## - Dalc 1 11.60 2971.0 809.33
## - romantic 1 12.67 2972.0 809.46
## - famsize 1 13.62 2973.0 809.57
## - address 1 14.41 2973.8 809.66
## - paid 1 15.84 2975.2 809.84
## <none> 2959.4 809.94
## - age 1 27.49 2986.8 811.23
## - higher 1 38.49 2997.8 812.54
## - health 1 47.45 3006.8 813.60
## - Fjob 4 105.83 3065.2 814.44
## - sex 1 62.53 3021.9 815.38
## - studytime 1 78.01 3037.4 817.20
## - goout 1 113.35 3072.7 821.32
## - Mjob 4 187.25 3146.6 823.78
## - schoolsup 1 143.11 3102.5 824.75
## - failures 1 533.88 3493.2 866.98
##
## Step: AIC=808.27
## G3 ~ sex + age + address + famsize + Medu + Mjob + Fjob + traveltime +
## studytime + failures + schoolsup + paid + higher + internet +
## romantic + famrel + goout + Dalc + health + absences
##
## Df Sum of Sq RSS AIC
## - traveltime 1 2.13 2964.3 806.53
## - absences 1 2.43 2964.6 806.56
## - internet 1 5.03 2967.2 806.87
## - Medu 1 5.06 2967.2 806.88
## - famrel 1 6.45 2968.6 807.04
## - romantic 1 12.07 2974.2 807.72
## - Dalc 1 12.16 2974.3 807.73
## - address 1 12.72 2974.8 807.79
## - famsize 1 14.19 2976.3 807.97
## - paid 1 15.76 2977.9 808.16
## <none> 2962.1 808.27
## - age 1 24.72 2986.9 809.23
## - higher 1 39.91 3002.0 811.03
## - health 1 47.94 3010.1 811.99
## - Fjob 4 107.59 3069.7 812.97
## - sex 1 61.35 3023.5 813.57
## - studytime 1 75.49 3037.6 815.23
## - goout 1 112.53 3074.7 819.54
## - Mjob 4 185.72 3147.8 821.92
## - schoolsup 1 145.10 3107.2 823.29
## - failures 1 543.96 3506.1 866.29
##
## Step: AIC=806.53
## G3 ~ sex + age + address + famsize + Medu + Mjob + Fjob + studytime +
## failures + schoolsup + paid + higher + internet + romantic +
## famrel + goout + Dalc + health + absences
##
## Df Sum of Sq RSS AIC
## - absences 1 2.43 2966.7 804.82
## - internet 1 4.82 2969.1 805.10
## - Medu 1 5.34 2969.6 805.17
## - famrel 1 6.75 2971.0 805.33
## - Dalc 1 11.41 2975.7 805.89
## - romantic 1 11.86 2976.1 805.95
## - famsize 1 13.16 2977.4 806.10
## - paid 1 15.56 2979.8 806.39
## - address 1 16.61 2980.9 806.51
## <none> 2964.3 806.53
## - age 1 25.06 2989.3 807.52
## - higher 1 40.28 3004.5 809.33
## - health 1 47.52 3011.8 810.19
## - Fjob 4 108.57 3072.8 811.33
## - sex 1 61.13 3025.4 811.79
## - studytime 1 77.04 3041.3 813.66
## - goout 1 112.23 3076.5 817.76
## - Mjob 4 187.57 3151.8 820.37
## - schoolsup 1 144.38 3108.6 821.46
## - failures 1 543.41 3507.7 864.45
##
## Step: AIC=804.82
## G3 ~ sex + age + address + famsize + Medu + Mjob + Fjob + studytime +
## failures + schoolsup + paid + higher + internet + romantic +
## famrel + goout + Dalc + health
##
## Df Sum of Sq RSS AIC
## - internet 1 5.72 2972.4 803.50
## - famrel 1 6.24 2972.9 803.56
## - Medu 1 6.51 2973.2 803.60
## - romantic 1 11.06 2977.8 804.14
## - Dalc 1 11.76 2978.4 804.22
## - famsize 1 13.16 2979.9 804.39
## - paid 1 15.62 2982.3 804.69
## - address 1 16.05 2982.7 804.74
## <none> 2966.7 804.82
## - age 1 22.90 2989.6 805.55
## - higher 1 39.40 3006.1 807.51
## - health 1 46.96 3013.7 808.41
## - Fjob 4 107.09 3073.8 809.44
## - sex 1 59.15 3025.8 809.84
## - studytime 1 75.28 3042.0 811.74
## - goout 1 113.04 3079.7 816.13
## - Mjob 4 187.06 3153.8 818.59
## - schoolsup 1 142.33 3109.0 819.50
## - failures 1 541.35 3508.0 862.49
##
## Step: AIC=803.5
## G3 ~ sex + age + address + famsize + Medu + Mjob + Fjob + studytime +
## failures + schoolsup + paid + higher + romantic + famrel +
## goout + Dalc + health
##
## Df Sum of Sq RSS AIC
## - Medu 1 6.54 2978.9 802.28
## - famrel 1 6.67 2979.1 802.30
## - romantic 1 9.67 2982.1 802.66
## - Dalc 1 12.20 2984.6 802.96
## - famsize 1 12.84 2985.2 803.04
## - paid 1 14.27 2986.7 803.21
## <none> 2972.4 803.50
## - address 1 19.22 2991.6 803.80
## - age 1 25.24 2997.7 804.51
## - higher 1 37.59 3010.0 805.98
## - health 1 48.85 3021.3 807.30
## - Fjob 4 105.52 3077.9 807.92
## - sex 1 61.57 3034.0 808.80
## - studytime 1 78.74 3051.1 810.81
## - goout 1 109.60 3082.0 814.39
## - Mjob 4 190.29 3162.7 817.59
## - schoolsup 1 142.16 3114.6 818.13
## - failures 1 540.94 3513.4 861.02
##
## Step: AIC=802.28
## G3 ~ sex + age + address + famsize + Mjob + Fjob + studytime +
## failures + schoolsup + paid + higher + romantic + famrel +
## goout + Dalc + health
##
## Df Sum of Sq RSS AIC
## - famrel 1 7.26 2986.2 801.15
## - romantic 1 7.48 2986.4 801.18
## - famsize 1 10.74 2989.7 801.57
## - paid 1 14.01 2993.0 801.95
## - Dalc 1 14.05 2993.0 801.96
## <none> 2978.9 802.28
## - address 1 20.58 2999.5 802.74
## - age 1 27.51 3006.5 803.56
## - higher 1 38.33 3017.3 804.84
## - health 1 54.82 3033.8 806.78
## - sex 1 60.07 3039.0 807.39
## - Fjob 4 114.99 3093.9 807.77
## - studytime 1 81.40 3060.3 809.88
## - goout 1 107.60 3086.6 812.92
## - schoolsup 1 142.39 3121.3 816.91
## - Mjob 4 216.89 3195.8 819.30
## - failures 1 563.27 3542.2 861.94
##
## Step: AIC=801.15
## G3 ~ sex + age + address + famsize + Mjob + Fjob + studytime +
## failures + schoolsup + paid + higher + romantic + goout +
## Dalc + health
##
## Df Sum of Sq RSS AIC
## - romantic 1 8.96 2995.2 800.22
## - famsize 1 10.18 2996.4 800.36
## - Dalc 1 11.83 2998.0 800.56
## - paid 1 13.65 2999.9 800.77
## <none> 2986.2 801.15
## - address 1 22.50 3008.7 801.82
## - age 1 24.71 3010.9 802.08
## - higher 1 39.20 3025.4 803.79
## - health 1 50.49 3036.7 805.12
## - Fjob 4 111.63 3097.8 806.22
## - sex 1 65.30 3051.5 806.85
## - studytime 1 84.88 3071.1 809.13
## - goout 1 105.90 3092.1 811.56
## - schoolsup 1 139.26 3125.5 815.38
## - Mjob 4 215.45 3201.7 817.95
## - failures 1 573.18 3559.4 861.66
##
## Step: AIC=800.22
## G3 ~ sex + age + address + famsize + Mjob + Fjob + studytime +
## failures + schoolsup + paid + higher + goout + Dalc + health
##
## Df Sum of Sq RSS AIC
## - famsize 1 9.65 3004.8 799.36
## - Dalc 1 11.90 3007.1 799.63
## - paid 1 14.32 3009.5 799.92
## <none> 2995.2 800.22
## - address 1 21.65 3016.8 800.78
## - age 1 30.02 3025.2 801.77
## - higher 1 43.41 3038.6 803.34
## - health 1 52.76 3047.9 804.43
## - Fjob 4 109.49 3104.7 805.00
## - sex 1 69.62 3064.8 806.40
## - studytime 1 82.12 3077.3 807.85
## - goout 1 105.16 3100.3 810.50
## - schoolsup 1 136.67 3131.8 814.10
## - Mjob 4 218.00 3213.2 817.23
## - failures 1 586.40 3581.6 861.87
##
## Step: AIC=799.36
## G3 ~ sex + age + address + Mjob + Fjob + studytime + failures +
## schoolsup + paid + higher + goout + Dalc + health
##
## Df Sum of Sq RSS AIC
## - Dalc 1 14.59 3019.4 799.09
## - paid 1 15.25 3020.1 799.16
## <none> 3004.8 799.36
## - address 1 25.08 3029.9 800.32
## - age 1 27.92 3032.7 800.65
## - higher 1 43.97 3048.8 802.53
## - Fjob 4 104.33 3109.1 803.51
## - health 1 53.97 3058.8 803.70
## - sex 1 72.06 3076.9 805.80
## - studytime 1 79.32 3084.1 806.64
## - goout 1 107.99 3112.8 809.93
## - schoolsup 1 134.74 3139.5 812.98
## - Mjob 4 225.64 3230.4 817.14
## - failures 1 591.64 3596.4 861.35
##
## Step: AIC=799.09
## G3 ~ sex + age + address + Mjob + Fjob + studytime + failures +
## schoolsup + paid + higher + goout + health
##
## Df Sum of Sq RSS AIC
## - paid 1 11.05 3030.5 798.39
## <none> 3019.4 799.09
## - address 1 20.74 3040.1 799.52
## - age 1 24.31 3043.7 799.94
## - higher 1 45.64 3065.0 802.43
## - health 1 50.91 3070.3 803.04
## - Fjob 4 109.18 3128.6 803.73
## - studytime 1 72.56 3092.0 805.54
## - sex 1 93.35 3112.7 807.93
## - goout 1 95.67 3115.1 808.19
## - schoolsup 1 134.63 3154.0 812.62
## - Mjob 4 216.43 3235.8 815.73
## - failures 1 578.39 3597.8 859.48
##
## Step: AIC=798.39
## G3 ~ sex + age + address + Mjob + Fjob + studytime + failures +
## schoolsup + higher + goout + health
##
## Df Sum of Sq RSS AIC
## <none> 3030.5 798.39
## - address 1 20.38 3050.8 798.77
## - age 1 25.24 3055.7 799.34
## - higher 1 41.25 3071.7 801.20
## - health 1 49.17 3079.6 802.12
## - Fjob 4 118.35 3148.8 804.03
## - studytime 1 67.84 3098.3 804.27
## - goout 1 98.06 3128.5 807.72
## - sex 1 100.81 3131.3 808.04
## - schoolsup 1 132.89 3163.3 811.67
## - Mjob 4 211.43 3241.9 814.40
## - failures 1 568.57 3599.0 857.60
# Coefficient table and fit statistics of the model chosen by step()
summary(object = selectedMod)
##
## Call:
## lm(formula = G3 ~ sex + age + address + Mjob + Fjob + studytime +
## failures + schoolsup + higher + goout + health, data = d1_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.4922 -2.0420 0.0357 2.0262 6.7142
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.37165 2.94147 4.886 1.59e-06 ***
## sexM 1.17017 0.34898 3.353 0.000890 ***
## age -0.24210 0.14431 -1.678 0.094334 .
## addressU 0.60042 0.39821 1.508 0.132540
## Mjobhealth 1.80672 0.73655 2.453 0.014674 *
## Mjobother -0.03889 0.52128 -0.075 0.940568
## Mjobservices 1.64617 0.55377 2.973 0.003164 **
## Mjobteacher 0.06240 0.64052 0.097 0.922451
## Fjobhealth -0.31962 1.08923 -0.293 0.769369
## Fjobother -0.75014 0.77050 -0.974 0.330965
## Fjobservices -0.14713 0.79696 -0.185 0.853637
## Fjobteacher 1.54225 0.95766 1.610 0.108234
## studytime 0.56950 0.20704 2.751 0.006267 **
## failures -2.05304 0.25781 -7.963 2.57e-14 ***
## schoolsupyes -1.95247 0.50714 -3.850 0.000141 ***
## higheryes 1.83648 0.85619 2.145 0.032669 *
## goout -0.50138 0.15161 -3.307 0.001044 **
## health -0.27391 0.11696 -2.342 0.019770 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.994 on 338 degrees of freedom
## Multiple R-squared: 0.3906, Adjusted R-squared: 0.3599
## F-statistic: 12.74 on 17 and 338 DF, p-value: < 2.2e-16
# Reduced model: the stepwise result without the address and Fjob terms
sigm1 <- lm(
  formula = G3 ~ sex + age + Mjob + studytime + failures + schoolsup +
    higher + goout + health,
  data = d1_clean
)
summary(object = sigm1)
##
## Call:
## lm(formula = G3 ~ sex + age + Mjob + studytime + failures + schoolsup +
## higher + goout + health, data = d1_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.6080 -1.9673 0.0355 2.2504 7.9983
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.513624 2.756777 5.265 2.48e-07 ***
## sexM 1.101512 0.353064 3.120 0.001963 **
## age -0.248582 0.143486 -1.732 0.084093 .
## Mjobhealth 2.013814 0.725710 2.775 0.005823 **
## Mjobother -0.009995 0.522717 -0.019 0.984755
## Mjobservices 1.966212 0.551694 3.564 0.000417 ***
## Mjobteacher 0.611118 0.631999 0.967 0.334245
## studytime 0.518637 0.208002 2.493 0.013123 *
## failures -2.104745 0.260452 -8.081 1.11e-14 ***
## schoolsupyes -1.771131 0.507593 -3.489 0.000547 ***
## higheryes 1.831547 0.867072 2.112 0.035380 *
## goout -0.479519 0.152743 -3.139 0.001840 **
## health -0.289778 0.118339 -2.449 0.014837 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.042 on 343 degrees of freedom
## Multiple R-squared: 0.3616, Adjusted R-squared: 0.3393
## F-statistic: 16.19 on 12 and 343 DF, p-value: < 2.2e-16
# Run the AIC-based step() search again, now starting from the reduced model
selected2Mod <- step(object = sigm1)
## Start: AIC=804.92
## G3 ~ sex + age + Mjob + studytime + failures + schoolsup + higher +
## goout + health
##
## Df Sum of Sq RSS AIC
## <none> 3174.5 804.92
## - age 1 27.78 3202.3 806.02
## - higher 1 41.30 3215.8 807.52
## - health 1 55.50 3230.0 809.09
## - studytime 1 57.54 3232.0 809.31
## - sex 1 90.08 3264.6 812.88
## - goout 1 91.22 3265.7 813.00
## - schoolsup 1 112.68 3287.2 815.33
## - Mjob 4 281.62 3456.1 827.18
## - failures 1 604.40 3778.9 864.96
# Coefficient table and fit statistics of the second stepwise result
summary(object = selected2Mod)
##
## Call:
## lm(formula = G3 ~ sex + age + Mjob + studytime + failures + schoolsup +
## higher + goout + health, data = d1_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.6080 -1.9673 0.0355 2.2504 7.9983
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.513624 2.756777 5.265 2.48e-07 ***
## sexM 1.101512 0.353064 3.120 0.001963 **
## age -0.248582 0.143486 -1.732 0.084093 .
## Mjobhealth 2.013814 0.725710 2.775 0.005823 **
## Mjobother -0.009995 0.522717 -0.019 0.984755
## Mjobservices 1.966212 0.551694 3.564 0.000417 ***
## Mjobteacher 0.611118 0.631999 0.967 0.334245
## studytime 0.518637 0.208002 2.493 0.013123 *
## failures -2.104745 0.260452 -8.081 1.11e-14 ***
## schoolsupyes -1.771131 0.507593 -3.489 0.000547 ***
## higheryes 1.831547 0.867072 2.112 0.035380 *
## goout -0.479519 0.152743 -3.139 0.001840 **
## health -0.289778 0.118339 -2.449 0.014837 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.042 on 343 degrees of freedom
## Multiple R-squared: 0.3616, Adjusted R-squared: 0.3393
## F-statistic: 16.19 on 12 and 343 DF, p-value: < 2.2e-16
# Same predictors as sigm1 except that age is left out
sigm2 <- lm(
  formula = G3 ~ sex + Mjob + studytime + failures + schoolsup + higher +
    goout + health,
  data = d1_clean
)
summary(object = sigm2)
##
## Call:
## lm(formula = G3 ~ sex + Mjob + studytime + failures + schoolsup +
## higher + goout + health, data = d1_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.6864 -2.0401 0.0763 2.1698 8.0103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.16573 1.14406 8.886 < 2e-16 ***
## sexM 1.13657 0.35351 3.215 0.001427 **
## Mjobhealth 2.15868 0.72297 2.986 0.003031 **
## Mjobother 0.01312 0.52406 0.025 0.980041
## Mjobservices 2.02633 0.55220 3.670 0.000282 ***
## Mjobteacher 0.65868 0.63324 1.040 0.298989
## studytime 0.50367 0.20843 2.417 0.016188 *
## failures -2.18818 0.25670 -8.524 4.92e-16 ***
## schoolsupyes -1.54185 0.49146 -3.137 0.001852 **
## higheryes 2.08059 0.85756 2.426 0.015772 *
## goout -0.50902 0.15223 -3.344 0.000918 ***
## health -0.28117 0.11858 -2.371 0.018280 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.051 on 344 degrees of freedom
## Multiple R-squared: 0.356, Adjusted R-squared: 0.3355
## F-statistic: 17.29 on 11 and 344 DF, p-value: < 2.2e-16
#best subsets selection
# d1_clean still carries G1 and G2. Every linear model in this script is
# fitted without them, so they are excluded here as well; otherwise the subset
# search is free to pick the earlier grades as predictors of G3.
# setdiff() is used so the call also works if those columns are already absent.
subset_data <- d1_clean[, setdiff(names(d1_clean), c("G1", "G2")), drop = FALSE]
allsubs <- regsubsets(G3 ~ ., data = subset_data)
summary(allsubs)
## Subset selection object
## Call: regsubsets.formula(G3 ~ ., data = d1_clean)
## 41 Variables (and intercept)
## Forced in Forced out
## schoolMS FALSE FALSE
## sexM FALSE FALSE
## age FALSE FALSE
## addressU FALSE FALSE
## famsizeLE3 FALSE FALSE
## PstatusT FALSE FALSE
## Medu FALSE FALSE
## Fedu FALSE FALSE
## Mjobhealth FALSE FALSE
## Mjobother FALSE FALSE
## Mjobservices FALSE FALSE
## Mjobteacher FALSE FALSE
## Fjobhealth FALSE FALSE
## Fjobother FALSE FALSE
## Fjobservices FALSE FALSE
## Fjobteacher FALSE FALSE
## reasonhome FALSE FALSE
## reasonother FALSE FALSE
## reasonreputation FALSE FALSE
## guardianmother FALSE FALSE
## guardianother FALSE FALSE
## traveltime FALSE FALSE
## studytime FALSE FALSE
## failures FALSE FALSE
## schoolsupyes FALSE FALSE
## famsupyes FALSE FALSE
## paidyes FALSE FALSE
## activitiesyes FALSE FALSE
## nurseryyes FALSE FALSE
## higheryes FALSE FALSE
## internetyes FALSE FALSE
## romanticyes FALSE FALSE
## famrel FALSE FALSE
## freetime FALSE FALSE
## goout FALSE FALSE
## Dalc FALSE FALSE
## Walc FALSE FALSE
## health FALSE FALSE
## absences FALSE FALSE
## G1 FALSE FALSE
## G2 FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## schoolMS sexM age addressU famsizeLE3 PstatusT Medu Fedu Mjobhealth
## 1 ( 1 ) " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " " " " " " " " " "
## Mjobother Mjobservices Mjobteacher Fjobhealth Fjobother Fjobservices
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " "
## Fjobteacher reasonhome reasonother reasonreputation guardianmother
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " "
## guardianother traveltime studytime failures schoolsupyes famsupyes
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " "*" " " " "
## 3 ( 1 ) " " " " " " "*" " " " "
## 4 ( 1 ) " " " " " " "*" " " " "
## 5 ( 1 ) " " " " " " "*" " " " "
## 6 ( 1 ) " " " " " " "*" " " " "
## 7 ( 1 ) " " " " " " "*" " " " "
## 8 ( 1 ) " " " " " " "*" " " " "
## paidyes activitiesyes nurseryyes higheryes internetyes romanticyes
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " "*"
## 6 ( 1 ) " " " " " " " " " " "*"
## 7 ( 1 ) " " " " " " " " " " "*"
## 8 ( 1 ) " " "*" " " " " " " "*"
## famrel freetime goout Dalc Walc health absences G1 G2
## 1 ( 1 ) " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) "*" " " " " " " " " " " " " " " "*"
## 4 ( 1 ) "*" " " " " " " " " " " "*" " " "*"
## 5 ( 1 ) "*" " " " " " " " " " " "*" " " "*"
## 6 ( 1 ) "*" " " " " " " " " " " "*" "*" "*"
## 7 ( 1 ) "*" " " " " " " " " " " "*" "*" "*"
## 8 ( 1 ) "*" " " " " " " " " " " "*" "*" "*"
# Linear model for G3 on parental jobs, study time, past failures,
# school/family support and going out
sigm3 <- lm(
  formula = G3 ~ Mjob + Fjob + studytime + failures + schoolsup + famsup +
    goout,
  data = d1_clean
)
summary(object = sigm3)
##
## Call:
## lm(formula = G3 ~ Mjob + Fjob + studytime + failures + schoolsup +
## famsup + goout, data = d1_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.0968 -1.8787 0.1611 1.8979 7.3466
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.17157 1.03148 11.800 < 2e-16 ***
## Mjobhealth 2.50274 0.73150 3.421 0.000698 ***
## Mjobother 0.29912 0.51776 0.578 0.563832
## Mjobservices 2.19123 0.54754 4.002 7.7e-05 ***
## Mjobteacher 0.83011 0.63198 1.313 0.189896
## Fjobhealth -0.05973 1.10010 -0.054 0.956728
## Fjobother -0.46987 0.77893 -0.603 0.546763
## Fjobservices -0.05002 0.80831 -0.062 0.950693
## Fjobteacher 1.84596 0.97359 1.896 0.058800 .
## studytime 0.53893 0.20379 2.645 0.008556 **
## failures -2.29565 0.25368 -9.049 < 2e-16 ***
## schoolsupyes -1.69588 0.49422 -3.431 0.000674 ***
## famsupyes -1.01961 0.34523 -2.953 0.003360 **
## goout -0.48114 0.15209 -3.163 0.001699 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.051 on 342 degrees of freedom
## Multiple R-squared: 0.3597, Adjusted R-squared: 0.3354
## F-statistic: 14.78 on 13 and 342 DF, p-value: < 2.2e-16
# List the coefficients of sigm3 that are statistically significant.
# The cut-off is the conventional 5% level (the previous value of 0.5 kept
# terms with p-values far above any usual significance threshold).
sig_level <- 0.05
summ <- summary(sigm3) # model summary
pvals <- coef(summ)[, "Pr(>|t|)"] # p-value column of the coefficient table
# which() drops NA p-values (e.g. from aliased terms) instead of propagating them
significant <- names(which(pvals < sig_level))
significant <- setdiff(significant, "(Intercept)") # the intercept is not a predictor
show(significant)
## [1] "Mjobhealth" "Mjobservices" "Mjobteacher" "Fjobteacher" "studytime"
## [6] "failures" "schoolsupyes" "famsupyes" "goout"
#Random Forest method
##10-fold CV
# For every candidate mtry, estimate the out-of-fold mean squared error of a
# random forest predicting G3: fit on all folds except j, evaluate on fold j,
# and average the nfolds errors.
# NOTE(review): no seed is set, so fold assignment and forests differ between
# runs -- consider calling set.seed() before this section.
# NOTE(review): G3 ~ . on d1_clean uses G1 and G2 as predictors, unlike the
# linear models above -- confirm this is intended.
nfolds <- 10
# createFolds() expects the outcome vector; list = FALSE returns one fold
# number per row.
fold <- createFolds(d1_clean$G3, k = nfolds, list = FALSE)
mlist <- c(5, 6, 7, 8, 9, 10)
msetemp <- rep(NA_real_, nfolds)
mselist <- rep(NA_real_, length(mlist))
for (i in seq_along(mlist)) {
  mvalue <- mlist[i]
  for (j in seq_len(nfolds)) {
    held_out <- fold == j
    # Train on the other folds only, so fold j is genuinely unseen data
    pred_rf <- randomForest(
      G3 ~ .,
      data = d1_clean[!held_out, ],
      mtry = mvalue, ntree = 2000, importance = TRUE
    )
    fold_pred <- predict(pred_rf, d1_clean[held_out, ], type = "response")
    # Compare against G3 by name; column 31 of d1_clean is G1, not G3
    msetemp[j] <- mean((d1_clean$G3[held_out] - fold_pred)^2)
  }
  mselist[i] <- mean(msetemp)
}
head(mselist)
## [1] 1.948078 1.886288 1.841493 1.799659 1.821110 1.807417
# Final random forest on the full cleaned data, using the mtry value with the
# lowest cross-validated MSE instead of a hard-coded choice.
best_mtry <- mlist[which.min(mselist)]
rf <- randomForest(G3 ~ ., data = d1_clean, mtry = best_mtry, ntree = 2000, importance = TRUE)

# Sanity check: one stored prediction per observation
length(rf$predicted)
## [1] 356
length(d1_clean$G3)
## [1] 356

# Predicted vs. actual G3.
# NOTE(review): predict(rf, d1_clean) scores the rows the forest was trained
# on, so this plot shows in-sample fit; rf$predicted may be the better choice
# if an honest picture of predictive accuracy is wanted -- confirm.
plot(d1_clean$G3, predict(rf, d1_clean, type = "response"), xlab = "actual observation", ylab = "Predicted", main = "Random Forest Prediction on Mathematics")

# Show the eight most important variables
varImpPlot(rf, sort = TRUE, n.var = 8, main = "Variable Importance Plot Mathematics")


# Portuguese data
# Fit a full linear regression model of G3 on every remaining column.
# Columns 31 and 32 are removed first (presumably G1 and G2 -- TODO confirm).
d22 <- d2[, -(31:32)]
m2 <- lm(G3 ~ ., data = d22)
summary(object = m2)
##
## Call:
## lm(formula = G3 ~ ., data = d22)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.8142 -1.3859 0.0094 1.5635 7.6487
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.68148 1.98532 4.373 1.44e-05 ***
## schoolMS -1.20033 0.26732 -4.490 8.51e-06 ***
## sexM -0.63306 0.25002 -2.532 0.011590 *
## age 0.15616 0.10219 1.528 0.127000
## addressU 0.32272 0.26181 1.233 0.218192
## famsizeLE3 0.30253 0.24502 1.235 0.217426
## PstatusT 0.17687 0.34669 0.510 0.610113
## Medu 0.03528 0.15134 0.233 0.815770
## Fedu 0.16686 0.13776 1.211 0.226295
## Mjobhealth 0.90149 0.53751 1.677 0.094023 .
## Mjobother 0.05042 0.30293 0.166 0.867868
## Mjobservices 0.42055 0.37309 1.127 0.260104
## Mjobteacher 0.51183 0.50191 1.020 0.308250
## Fjobhealth -0.61218 0.75234 -0.814 0.416136
## Fjobother -0.18438 0.45619 -0.404 0.686228
## Fjobservices -0.64339 0.47923 -1.343 0.179916
## Fjobteacher 0.57968 0.67224 0.862 0.388854
## reasonhome 0.05052 0.28491 0.177 0.859323
## reasonother -0.43494 0.36763 -1.183 0.237232
## reasonreputation 0.21767 0.29800 0.730 0.465403
## guardianmother -0.33847 0.26516 -1.276 0.202271
## guardianother 0.10499 0.53168 0.197 0.843529
## traveltime 0.06249 0.15915 0.393 0.694707
## studytime 0.40668 0.13994 2.906 0.003793 **
## failures -1.41221 0.20450 -6.906 1.26e-11 ***
## schoolsupyes -1.31116 0.36405 -3.602 0.000342 ***
## famsupyes -0.02037 0.22829 -0.089 0.928938
## paidyes -0.37159 0.46142 -0.805 0.420957
## activitiesyes 0.21915 0.22341 0.981 0.327000
## nurseryyes -0.21605 0.27139 -0.796 0.426291
## higheryes 1.73300 0.38274 4.528 7.17e-06 ***
## internetyes 0.25287 0.27631 0.915 0.360465
## romanticyes -0.43156 0.22922 -1.883 0.060217 .
## famrel 0.16155 0.11612 1.391 0.164640
## freetime -0.13777 0.11234 -1.226 0.220520
## goout -0.06606 0.10748 -0.615 0.539012
## Dalc -0.20478 0.15306 -1.338 0.181426
## Walc -0.08148 0.11846 -0.688 0.491824
## health -0.18745 0.07720 -2.428 0.015468 *
## absences -0.03807 0.02486 -1.531 0.126295
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.665 on 609 degrees of freedom
## Multiple R-squared: 0.3603, Adjusted R-squared: 0.3194
## F-statistic: 8.797 on 39 and 609 DF, p-value: < 2.2e-16
# Standard lm diagnostic plots for the full Portuguese model.
plot(m2)

# Flag observations whose Cook's distance exceeds three times the mean
# distance, then remove them from d22. Influential rows are looked up by row
# name and removed with an anti-join on all columns, so any other row that is
# identical to an influential one in every column is removed as well.
cooksD <- cooks.distance(m2)
influential <- cooksD[cooksD > 3 * mean(cooksD, na.rm = TRUE)]
names_of_influential <- names(influential)
df <- d22[names_of_influential, ]
d2_clean <- anti_join(d22, df)
## Joining with `by = join_by(school, sex, age, address, famsize, Pstatus, Medu,
## Fedu, Mjob, Fjob, reason, guardian, traveltime, studytime, failures, schoolsup,
## famsup, paid, activities, nursery, higher, internet, romantic, famrel,
## freetime, goout, Dalc, Walc, health, absences, G3)`
# Refit the full model on the cleaned Portuguese data and redo the diagnostics.
m2c <- lm(G3 ~ ., data = d2_clean)
plot(m2c)

# Split the cleaned data into the response (kept as a one-column data frame)
# and the predictor columns.
response_df2 <- d2_clean["G3"] # Y variable
predictors_df2 <- d2_clean[, names(d2_clean) != "G3"] # X variables
# Distribution of the final grade after cleaning.
hist(
  d2_clean$G3,
  main = "G3 Grades for Portuguese",
  xlab = "G3"
)

# Stepwise selection: AIC-based term elimination starting from the cleaned
# full model; the trace of every step is printed.
selectedMod21 <- step(object = m2c)
## Start: AIC=821.02
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
## Fedu + Mjob + Fjob + reason + guardian + traveltime + studytime +
## failures + schoolsup + famsup + paid + activities + nursery +
## higher + internet + romantic + famrel + freetime + goout +
## Dalc + Walc + health + absences
##
## Df Sum of Sq RSS AIC
## - Mjob 4 13.282 2072.4 816.91
## - guardian 2 6.425 2065.6 818.91
## - traveltime 1 0.004 2059.2 819.02
## - Pstatus 1 0.035 2059.2 819.03
## - nursery 1 0.088 2059.3 819.05
## - address 1 0.547 2059.7 819.18
## - Walc 1 1.138 2060.3 819.36
## - famsup 1 1.196 2060.4 819.37
## - Fedu 1 3.589 2062.8 820.08
## - famsize 1 3.858 2063.0 820.15
## - reason 3 18.831 2078.0 820.53
## - Dalc 1 5.160 2064.3 820.54
## <none> 2059.2 821.02
## - goout 1 8.630 2067.8 821.55
## - internet 1 8.931 2068.1 821.64
## - Medu 1 9.333 2068.5 821.76
## - romantic 1 10.285 2069.5 822.04
## - freetime 1 11.961 2071.1 822.53
## - paid 1 13.799 2073.0 823.06
## - activities 1 14.984 2074.2 823.41
## - health 1 21.990 2081.2 825.45
## - Fjob 4 44.122 2103.3 825.85
## - famrel 1 29.186 2088.4 827.54
## - age 1 42.909 2102.1 831.50
## - sex 1 45.768 2104.9 832.32
## - school 1 50.819 2110.0 833.77
## - studytime 1 56.520 2115.7 835.40
## - absences 1 58.886 2118.1 836.08
## - higher 1 99.347 2158.5 847.53
## - schoolsup 1 122.136 2181.3 853.88
## - failures 1 214.788 2274.0 879.05
##
## Step: AIC=816.91
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
## Fedu + Fjob + reason + guardian + traveltime + studytime +
## failures + schoolsup + famsup + paid + activities + nursery +
## higher + internet + romantic + famrel + freetime + goout +
## Dalc + Walc + health + absences
##
## Df Sum of Sq RSS AIC
## - traveltime 1 0.001 2072.4 814.91
## - Pstatus 1 0.044 2072.5 814.92
## - nursery 1 0.056 2072.5 814.93
## - guardian 2 7.388 2079.8 815.06
## - Walc 1 0.821 2073.3 815.15
## - address 1 0.845 2073.3 815.16
## - famsup 1 1.176 2073.6 815.26
## - Fedu 1 2.394 2074.8 815.61
## - famsize 1 3.938 2076.4 816.06
## - Dalc 1 6.670 2079.1 816.86
## <none> 2072.4 816.91
## - goout 1 7.782 2080.2 817.18
## - reason 3 21.933 2094.4 817.28
## - romantic 1 9.528 2082.0 817.69
## - internet 1 11.255 2083.7 818.19
## - freetime 1 11.357 2083.8 818.22
## - activities 1 14.564 2087.0 819.15
## - paid 1 14.926 2087.4 819.25
## - health 1 19.635 2092.1 820.62
## - Fjob 4 42.997 2115.4 821.34
## - Medu 1 24.986 2097.4 822.16
## - famrel 1 26.158 2098.6 822.50
## - age 1 40.643 2113.1 826.66
## - sex 1 43.207 2115.7 827.40
## - school 1 52.476 2124.9 830.04
## - studytime 1 53.704 2126.2 830.39
## - absences 1 65.779 2138.2 833.82
## - higher 1 104.772 2177.2 844.75
## - schoolsup 1 128.366 2200.8 851.27
## - failures 1 210.746 2283.2 873.50
##
## Step: AIC=814.91
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
## Fedu + Fjob + reason + guardian + studytime + failures +
## schoolsup + famsup + paid + activities + nursery + higher +
## internet + romantic + famrel + freetime + goout + Dalc +
## Walc + health + absences
##
## Df Sum of Sq RSS AIC
## - Pstatus 1 0.044 2072.5 812.92
## - nursery 1 0.056 2072.5 812.93
## - guardian 2 7.387 2079.8 813.06
## - Walc 1 0.822 2073.3 813.15
## - address 1 0.907 2073.4 813.18
## - famsup 1 1.175 2073.6 813.26
## - Fedu 1 2.410 2074.9 813.62
## - famsize 1 3.937 2076.4 814.06
## - Dalc 1 6.676 2079.1 814.86
## <none> 2072.4 814.91
## - goout 1 7.826 2080.3 815.19
## - reason 3 22.104 2094.6 815.33
## - romantic 1 9.527 2082.0 815.69
## - internet 1 11.376 2083.8 816.22
## - freetime 1 11.408 2083.9 816.23
## - activities 1 14.567 2087.0 817.15
## - paid 1 14.944 2087.4 817.26
## - health 1 19.708 2092.2 818.64
## - Fjob 4 43.109 2115.6 819.37
## - Medu 1 25.294 2097.8 820.25
## - famrel 1 26.173 2098.6 820.50
## - age 1 40.758 2113.2 824.69
## - sex 1 43.900 2116.3 825.59
## - school 1 53.057 2125.5 828.21
## - studytime 1 53.749 2126.2 828.40
## - absences 1 65.852 2138.3 831.84
## - higher 1 104.850 2177.3 842.77
## - schoolsup 1 128.634 2201.1 849.34
## - failures 1 211.150 2283.6 871.61
##
## Step: AIC=812.92
## G3 ~ school + sex + age + address + famsize + Medu + Fedu + Fjob +
## reason + guardian + studytime + failures + schoolsup + famsup +
## paid + activities + nursery + higher + internet + romantic +
## famrel + freetime + goout + Dalc + Walc + health + absences
##
## Df Sum of Sq RSS AIC
## - nursery 1 0.056 2072.6 810.94
## - guardian 2 7.659 2080.2 811.16
## - Walc 1 0.802 2073.3 811.16
## - address 1 0.878 2073.4 811.18
## - famsup 1 1.160 2073.7 811.26
## - Fedu 1 2.390 2074.9 811.62
## - famsize 1 3.982 2076.5 812.09
## - Dalc 1 6.691 2079.2 812.87
## <none> 2072.5 812.92
## - goout 1 7.839 2080.3 813.21
## - reason 3 22.062 2094.6 813.33
## - romantic 1 9.632 2082.1 813.73
## - freetime 1 11.375 2083.9 814.24
## - internet 1 11.544 2084.0 814.29
## - activities 1 14.950 2087.4 815.27
## - paid 1 14.959 2087.5 815.28
## - health 1 19.728 2092.2 816.66
## - Fjob 4 43.071 2115.6 817.37
## - Medu 1 25.254 2097.8 818.25
## - famrel 1 26.284 2098.8 818.55
## - age 1 41.301 2113.8 822.86
## - sex 1 43.865 2116.4 823.60
## - school 1 53.290 2125.8 826.28
## - studytime 1 53.730 2126.2 826.41
## - absences 1 67.210 2139.7 830.23
## - higher 1 105.081 2177.6 840.85
## - schoolsup 1 128.706 2201.2 847.38
## - failures 1 211.260 2283.8 869.65
##
## Step: AIC=810.94
## G3 ~ school + sex + age + address + famsize + Medu + Fedu + Fjob +
## reason + guardian + studytime + failures + schoolsup + famsup +
## paid + activities + higher + internet + romantic + famrel +
## freetime + goout + Dalc + Walc + health + absences
##
## Df Sum of Sq RSS AIC
## - guardian 2 7.642 2080.2 809.17
## - Walc 1 0.817 2073.4 809.18
## - address 1 0.874 2073.4 809.20
## - famsup 1 1.148 2073.7 809.28
## - Fedu 1 2.399 2074.9 809.64
## - famsize 1 4.121 2076.7 810.14
## - Dalc 1 6.716 2079.3 810.90
## <none> 2072.6 810.94
## - goout 1 7.805 2080.4 811.22
## - reason 3 22.055 2094.6 811.35
## - romantic 1 9.645 2082.2 811.75
## - freetime 1 11.415 2084.0 812.26
## - internet 1 11.512 2084.1 812.29
## - paid 1 14.938 2087.5 813.29
## - activities 1 15.087 2087.6 813.33
## - health 1 19.694 2092.2 814.66
## - Fjob 4 43.048 2115.6 815.38
## - Medu 1 25.726 2098.3 816.40
## - famrel 1 26.355 2098.9 816.59
## - age 1 41.579 2114.1 820.96
## - sex 1 44.013 2116.6 821.65
## - school 1 53.234 2125.8 824.28
## - studytime 1 53.725 2126.3 824.42
## - absences 1 67.308 2139.9 828.28
## - higher 1 105.032 2177.6 838.85
## - schoolsup 1 128.684 2201.2 845.39
## - failures 1 212.796 2285.3 868.07
##
## Step: AIC=809.17
## G3 ~ school + sex + age + address + famsize + Medu + Fedu + Fjob +
## reason + studytime + failures + schoolsup + famsup + paid +
## activities + higher + internet + romantic + famrel + freetime +
## goout + Dalc + Walc + health + absences
##
## Df Sum of Sq RSS AIC
## - Walc 1 0.818 2081.0 807.41
## - address 1 0.946 2081.1 807.44
## - famsup 1 1.111 2081.3 807.49
## - famsize 1 3.856 2084.1 808.29
## - Fedu 1 3.934 2084.1 808.31
## - Dalc 1 6.324 2086.5 809.00
## <none> 2080.2 809.17
## - goout 1 8.824 2089.0 809.73
## - reason 3 23.423 2103.6 809.94
## - romantic 1 9.903 2090.1 810.04
## - freetime 1 12.169 2092.4 810.70
## - internet 1 12.666 2092.9 810.84
## - activities 1 14.584 2094.8 811.39
## - paid 1 16.850 2097.1 812.05
## - health 1 19.929 2100.1 812.94
## - Fjob 4 42.341 2122.5 813.36
## - Medu 1 22.303 2102.5 813.62
## - famrel 1 26.830 2107.0 814.92
## - age 1 40.502 2120.7 818.83
## - sex 1 42.236 2122.4 819.33
## - school 1 50.790 2131.0 821.76
## - studytime 1 54.488 2134.7 822.81
## - absences 1 71.127 2151.3 827.51
## - higher 1 105.071 2185.3 836.98
## - schoolsup 1 127.954 2208.2 843.28
## - failures 1 213.834 2294.0 866.37
##
## Step: AIC=807.41
## G3 ~ school + sex + age + address + famsize + Medu + Fedu + Fjob +
## reason + studytime + failures + schoolsup + famsup + paid +
## activities + higher + internet + romantic + famrel + freetime +
## goout + Dalc + health + absences
##
## Df Sum of Sq RSS AIC
## - address 1 0.993 2082.0 805.69
## - famsup 1 1.042 2082.1 805.71
## - Fedu 1 3.604 2084.6 806.45
## - famsize 1 3.661 2084.7 806.47
## <none> 2081.0 807.41
## - reason 3 23.097 2104.1 808.08
## - romantic 1 9.726 2090.7 808.23
## - freetime 1 11.921 2092.9 808.86
## - Dalc 1 11.939 2092.9 808.87
## - goout 1 11.997 2093.0 808.88
## - internet 1 12.633 2093.7 809.07
## - activities 1 14.853 2095.9 809.71
## - paid 1 17.124 2098.1 810.36
## - health 1 21.340 2102.3 811.58
## - Medu 1 23.197 2104.2 812.11
## - Fjob 4 45.301 2126.3 812.43
## - famrel 1 28.626 2109.6 813.67
## - age 1 40.680 2121.7 817.12
## - sex 1 45.613 2126.6 818.52
## - school 1 50.281 2131.3 819.85
## - studytime 1 57.018 2138.0 821.76
## - absences 1 72.242 2153.2 826.05
## - higher 1 104.379 2185.4 835.01
## - schoolsup 1 127.137 2208.2 841.28
## - failures 1 213.748 2294.8 864.56
##
## Step: AIC=805.69
## G3 ~ school + sex + age + famsize + Medu + Fedu + Fjob + reason +
## studytime + failures + schoolsup + famsup + paid + activities +
## higher + internet + romantic + famrel + freetime + goout +
## Dalc + health + absences
##
## Df Sum of Sq RSS AIC
## - famsup 1 1.140 2083.2 804.03
## - Fedu 1 3.714 2085.7 804.77
## - famsize 1 3.927 2085.9 804.83
## <none> 2082.0 805.69
## - reason 3 23.156 2105.2 806.39
## - romantic 1 9.869 2091.9 806.56
## - freetime 1 11.882 2093.9 807.14
## - goout 1 11.908 2093.9 807.14
## - Dalc 1 12.473 2094.5 807.31
## - internet 1 13.298 2095.3 807.55
## - activities 1 14.458 2096.5 807.88
## - paid 1 17.585 2099.6 808.78
## - health 1 21.367 2103.4 809.87
## - Fjob 4 44.911 2126.9 810.61
## - Medu 1 24.065 2106.1 810.65
## - famrel 1 28.079 2110.1 811.80
## - age 1 40.925 2122.9 815.47
## - sex 1 45.681 2127.7 816.82
## - studytime 1 57.100 2139.1 820.06
## - school 1 59.466 2141.5 820.73
## - absences 1 72.019 2154.0 824.27
## - higher 1 105.133 2187.1 833.50
## - schoolsup 1 127.816 2209.8 839.74
## - failures 1 213.131 2295.1 862.66
##
## Step: AIC=804.03
## G3 ~ school + sex + age + famsize + Medu + Fedu + Fjob + reason +
## studytime + failures + schoolsup + paid + activities + higher +
## internet + romantic + famrel + freetime + goout + Dalc +
## health + absences
##
## Df Sum of Sq RSS AIC
## - Fedu 1 3.515 2086.7 803.05
## - famsize 1 4.056 2087.2 803.20
## <none> 2083.2 804.03
## - reason 3 22.727 2105.9 804.59
## - romantic 1 9.737 2092.9 804.85
## - goout 1 11.898 2095.0 805.47
## - freetime 1 11.936 2095.1 805.48
## - Dalc 1 12.922 2096.1 805.77
## - internet 1 12.930 2096.1 805.77
## - activities 1 14.958 2098.1 806.35
## - paid 1 18.428 2101.6 807.35
## - health 1 21.942 2105.1 808.36
## - Medu 1 23.554 2106.7 808.83
## - Fjob 4 44.784 2127.9 808.89
## - famrel 1 27.970 2111.1 810.09
## - age 1 43.007 2126.2 814.39
## - sex 1 44.556 2127.7 814.83
## - studytime 1 56.045 2139.2 818.09
## - school 1 59.976 2143.1 819.20
## - absences 1 72.734 2155.9 822.79
## - higher 1 104.729 2187.9 831.70
## - schoolsup 1 127.807 2210.9 838.05
## - failures 1 217.335 2300.5 862.07
##
## Step: AIC=803.05
## G3 ~ school + sex + age + famsize + Medu + Fjob + reason + studytime +
## failures + schoolsup + paid + activities + higher + internet +
## romantic + famrel + freetime + goout + Dalc + health + absences
##
## Df Sum of Sq RSS AIC
## - famsize 1 3.754 2090.4 802.13
## <none> 2086.7 803.05
## - reason 3 22.800 2109.5 803.62
## - romantic 1 10.640 2097.3 804.12
## - goout 1 11.507 2098.2 804.37
## - freetime 1 11.660 2098.3 804.42
## - Dalc 1 13.083 2099.7 804.83
## - internet 1 13.386 2100.1 804.91
## - activities 1 14.977 2101.6 805.37
## - paid 1 17.606 2104.3 806.13
## - health 1 21.334 2108.0 807.20
## - famrel 1 28.445 2115.1 809.24
## - Fjob 4 52.259 2138.9 810.01
## - age 1 43.133 2129.8 813.42
## - sex 1 44.115 2130.8 813.70
## - Medu 1 50.688 2137.3 815.57
## - studytime 1 55.155 2141.8 816.83
## - school 1 59.666 2146.3 818.10
## - absences 1 70.718 2157.4 821.21
## - higher 1 106.831 2193.5 831.25
## - schoolsup 1 126.044 2212.7 836.53
## - failures 1 221.312 2308.0 862.03
##
## Step: AIC=802.13
## G3 ~ school + sex + age + Medu + Fjob + reason + studytime +
## failures + schoolsup + paid + activities + higher + internet +
## romantic + famrel + freetime + goout + Dalc + health + absences
##
## Df Sum of Sq RSS AIC
## <none> 2090.4 802.13
## - reason 3 23.848 2114.3 803.00
## - romantic 1 10.788 2101.2 803.25
## - goout 1 11.740 2102.2 803.52
## - freetime 1 12.052 2102.5 803.61
## - Dalc 1 12.275 2102.7 803.68
## - internet 1 13.419 2103.8 804.00
## - activities 1 14.861 2105.3 804.42
## - paid 1 18.437 2108.8 805.45
## - health 1 21.240 2111.7 806.25
## - famrel 1 28.424 2118.8 808.30
## - Fjob 4 50.891 2141.3 808.69
## - sex 1 42.049 2132.5 812.18
## - age 1 43.878 2134.3 812.70
## - Medu 1 50.504 2140.9 814.58
## - studytime 1 54.811 2145.2 815.79
## - school 1 58.181 2148.6 816.74
## - absences 1 71.868 2162.3 820.58
## - higher 1 107.015 2197.4 830.34
## - schoolsup 1 126.876 2217.3 835.78
## - failures 1 227.171 2317.6 862.55
# Coefficient table and fit statistics of the model kept by the stepwise search.
summary(object = selectedMod21)
##
## Call:
## lm(formula = G3 ~ school + sex + age + Medu + Fjob + reason +
## studytime + failures + schoolsup + paid + activities + higher +
## internet + romantic + famrel + freetime + goout + Dalc +
## health + absences, data = d2_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.4000 -1.3234 -0.0888 1.3319 5.4844
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.60115 1.37637 4.796 2.06e-06 ***
## schoolMS -0.76523 0.19063 -4.014 6.75e-05 ***
## sexM -0.61073 0.17896 -3.413 0.000688 ***
## age 0.25184 0.07224 3.486 0.000527 ***
## Medu 0.29679 0.07935 3.740 0.000202 ***
## Fjobhealth 0.23538 0.54177 0.434 0.664114
## Fjobother -0.13557 0.33331 -0.407 0.684356
## Fjobservices -0.36848 0.35068 -1.051 0.293814
## Fjobteacher 1.04922 0.47955 2.188 0.029073 *
## reasonhome 0.36365 0.20747 1.753 0.080169 .
## reasonother -0.02469 0.27448 -0.090 0.928342
## reasonreputation 0.47303 0.21358 2.215 0.027168 *
## studytime 0.39417 0.10116 3.896 0.000109 ***
## failures -1.24176 0.15654 -7.932 1.12e-14 ***
## schoolsupyes -1.59164 0.26849 -5.928 5.27e-09 ***
## paidyes -0.76440 0.33826 -2.260 0.024205 *
## activitiesyes 0.33105 0.16317 2.029 0.042935 *
## higheryes 1.55537 0.28568 5.444 7.69e-08 ***
## internetyes 0.38471 0.19955 1.928 0.054359 .
## romanticyes -0.28984 0.16767 -1.729 0.084411 .
## famrel 0.23912 0.08522 2.806 0.005187 **
## freetime -0.15347 0.08400 -1.827 0.068210 .
## goout -0.13572 0.07527 -1.803 0.071873 .
## Dalc -0.18284 0.09916 -1.844 0.065707 .
## health -0.13558 0.05590 -2.425 0.015593 *
## absences -0.08364 0.01875 -4.462 9.78e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.9 on 579 degrees of freedom
## Multiple R-squared: 0.4663, Adjusted R-squared: 0.4432
## F-statistic: 20.23 on 25 and 579 DF, p-value: < 2.2e-16
# Refit on a hand-picked subset of the stepwise terms.
# NOTE(review): school, higher and Dalc from selectedMod21 are left out here,
# although school and higher were strongly significant in that model --
# confirm this is intended.
sigm21 <- lm(
  G3 ~ sex + age + Medu + reason + Fjob + studytime + failures + schoolsup +
    paid + activities + internet + romantic + famrel + freetime + goout +
    health + absences,
  data = d2_clean
)
summary(object = sigm21)
##
## Call:
## lm(formula = G3 ~ sex + age + Medu + reason + Fjob + studytime +
## failures + schoolsup + paid + activities + internet + romantic +
## famrel + freetime + goout + health + absences, data = d2_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.1229 -1.3318 -0.0946 1.2837 5.4003
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.97432 1.36361 5.848 8.30e-09 ***
## sexM -0.69949 0.17801 -3.929 9.54e-05 ***
## age 0.17116 0.07380 2.319 0.02073 *
## Medu 0.42002 0.08031 5.230 2.37e-07 ***
## reasonhome 0.47532 0.21437 2.217 0.02699 *
## reasonother -0.26681 0.28021 -0.952 0.34140
## reasonreputation 0.57658 0.22147 2.603 0.00947 **
## Fjobhealth 0.52233 0.56089 0.931 0.35212
## Fjobother 0.13661 0.34311 0.398 0.69066
## Fjobservices -0.16830 0.36192 -0.465 0.64210
## Fjobteacher 1.36871 0.49519 2.764 0.00589 **
## studytime 0.52701 0.10344 5.095 4.73e-07 ***
## failures -1.40982 0.16079 -8.768 < 2e-16 ***
## schoolsupyes -1.40709 0.27621 -5.094 4.74e-07 ***
## paidyes -0.82036 0.35157 -2.333 0.01997 *
## activitiesyes 0.38731 0.16941 2.286 0.02260 *
## internetyes 0.54833 0.20452 2.681 0.00755 **
## romanticyes -0.37064 0.17403 -2.130 0.03360 *
## famrel 0.27922 0.08794 3.175 0.00158 **
## freetime -0.19727 0.08716 -2.263 0.02398 *
## goout -0.18320 0.07637 -2.399 0.01676 *
## health -0.10819 0.05793 -1.868 0.06231 .
## absences -0.08664 0.01882 -4.604 5.08e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.977 on 582 degrees of freedom
## Multiple R-squared: 0.4193, Adjusted R-squared: 0.3974
## F-statistic: 19.1 on 22 and 582 DF, p-value: < 2.2e-16
# Second stepwise pass, this time starting from the reduced model sigm21.
selectedMod22 <- step(object = sigm21)
## Start: AIC=847.17
## G3 ~ sex + age + Medu + reason + Fjob + studytime + failures +
## schoolsup + paid + activities + internet + romantic + famrel +
## freetime + goout + health + absences
##
## Df Sum of Sq RSS AIC
## <none> 2274.4 847.17
## - health 1 13.632 2288.1 848.79
## - romantic 1 17.727 2292.2 849.87
## - freetime 1 20.021 2294.4 850.47
## - activities 1 20.425 2294.8 850.58
## - age 1 21.019 2295.4 850.74
## - paid 1 21.278 2295.7 850.81
## - goout 1 22.487 2296.9 851.12
## - internet 1 28.090 2302.5 852.60
## - reason 3 48.684 2323.1 853.99
## - Fjob 4 60.650 2335.1 855.09
## - famrel 1 39.394 2313.8 855.56
## - sex 1 60.342 2334.8 861.01
## - absences 1 82.852 2357.3 866.82
## - schoolsup 1 101.415 2375.8 871.56
## - studytime 1 101.431 2375.8 871.57
## - Medu 1 106.895 2381.3 872.96
## - failures 1 300.434 2574.8 920.23
# Summary of the model returned by the second stepwise pass.
summary(object = selectedMod22)
##
## Call:
## lm(formula = G3 ~ sex + age + Medu + reason + Fjob + studytime +
## failures + schoolsup + paid + activities + internet + romantic +
## famrel + freetime + goout + health + absences, data = d2_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.1229 -1.3318 -0.0946 1.2837 5.4003
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.97432 1.36361 5.848 8.30e-09 ***
## sexM -0.69949 0.17801 -3.929 9.54e-05 ***
## age 0.17116 0.07380 2.319 0.02073 *
## Medu 0.42002 0.08031 5.230 2.37e-07 ***
## reasonhome 0.47532 0.21437 2.217 0.02699 *
## reasonother -0.26681 0.28021 -0.952 0.34140
## reasonreputation 0.57658 0.22147 2.603 0.00947 **
## Fjobhealth 0.52233 0.56089 0.931 0.35212
## Fjobother 0.13661 0.34311 0.398 0.69066
## Fjobservices -0.16830 0.36192 -0.465 0.64210
## Fjobteacher 1.36871 0.49519 2.764 0.00589 **
## studytime 0.52701 0.10344 5.095 4.73e-07 ***
## failures -1.40982 0.16079 -8.768 < 2e-16 ***
## schoolsupyes -1.40709 0.27621 -5.094 4.74e-07 ***
## paidyes -0.82036 0.35157 -2.333 0.01997 *
## activitiesyes 0.38731 0.16941 2.286 0.02260 *
## internetyes 0.54833 0.20452 2.681 0.00755 **
## romanticyes -0.37064 0.17403 -2.130 0.03360 *
## famrel 0.27922 0.08794 3.175 0.00158 **
## freetime -0.19727 0.08716 -2.263 0.02398 *
## goout -0.18320 0.07637 -2.399 0.01676 *
## health -0.10819 0.05793 -1.868 0.06231 .
## absences -0.08664 0.01882 -4.604 5.08e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.977 on 582 degrees of freedom
## Multiple R-squared: 0.4193, Adjusted R-squared: 0.3974
## F-statistic: 19.1 on 22 and 582 DF, p-value: < 2.2e-16
# Same terms as sigm21 with health removed.
sigm22 <- lm(
  G3 ~ sex + age + Medu + reason + Fjob + studytime + failures + schoolsup +
    paid + activities + internet + romantic + famrel + freetime + goout +
    absences,
  data = d2_clean
)
summary(object = sigm22)
##
## Call:
## lm(formula = G3 ~ sex + age + Medu + reason + Fjob + studytime +
## failures + schoolsup + paid + activities + internet + romantic +
## famrel + freetime + goout + absences, data = d2_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2110 -1.3170 -0.0422 1.2400 5.6566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.71097 1.35919 5.673 2.21e-08 ***
## sexM -0.73985 0.17707 -4.178 3.39e-05 ***
## age 0.16868 0.07395 2.281 0.02291 *
## Medu 0.42280 0.08047 5.254 2.09e-07 ***
## reasonhome 0.49467 0.21458 2.305 0.02150 *
## reasonother -0.25258 0.28071 -0.900 0.36860
## reasonreputation 0.63113 0.22001 2.869 0.00427 **
## Fjobhealth 0.40472 0.55854 0.725 0.46898
## Fjobother 0.11551 0.34366 0.336 0.73691
## Fjobservices -0.16329 0.36268 -0.450 0.65272
## Fjobteacher 1.33718 0.49595 2.696 0.00722 **
## studytime 0.52948 0.10366 5.108 4.42e-07 ***
## failures -1.40979 0.16113 -8.749 < 2e-16 ***
## schoolsupyes -1.41881 0.27673 -5.127 4.01e-07 ***
## paidyes -0.86332 0.35156 -2.456 0.01435 *
## activitiesyes 0.38266 0.16976 2.254 0.02456 *
## internetyes 0.55993 0.20486 2.733 0.00646 **
## romanticyes -0.36747 0.17439 -2.107 0.03553 *
## famrel 0.26294 0.08770 2.998 0.00283 **
## freetime -0.20948 0.08710 -2.405 0.01648 *
## goout -0.17498 0.07641 -2.290 0.02237 *
## absences -0.08665 0.01886 -4.595 5.30e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.981 on 583 degrees of freedom
## Multiple R-squared: 0.4158, Adjusted R-squared: 0.3948
## F-statistic: 19.76 on 21 and 583 DF, p-value: < 2.2e-16
# Best subsets selection: exhaustive search over the cleaned Portuguese data
# with regsubsets' default settings.
allsubs2 <- regsubsets(G3 ~ ., data = d2_clean)
summary(object = allsubs2)
## Subset selection object
## Call: regsubsets.formula(G3 ~ ., data = d2_clean)
## 39 Variables (and intercept)
## Forced in Forced out
## schoolMS FALSE FALSE
## sexM FALSE FALSE
## age FALSE FALSE
## addressU FALSE FALSE
## famsizeLE3 FALSE FALSE
## PstatusT FALSE FALSE
## Medu FALSE FALSE
## Fedu FALSE FALSE
## Mjobhealth FALSE FALSE
## Mjobother FALSE FALSE
## Mjobservices FALSE FALSE
## Mjobteacher FALSE FALSE
## Fjobhealth FALSE FALSE
## Fjobother FALSE FALSE
## Fjobservices FALSE FALSE
## Fjobteacher FALSE FALSE
## reasonhome FALSE FALSE
## reasonother FALSE FALSE
## reasonreputation FALSE FALSE
## guardianmother FALSE FALSE
## guardianother FALSE FALSE
## traveltime FALSE FALSE
## studytime FALSE FALSE
## failures FALSE FALSE
## schoolsupyes FALSE FALSE
## famsupyes FALSE FALSE
## paidyes FALSE FALSE
## activitiesyes FALSE FALSE
## nurseryyes FALSE FALSE
## higheryes FALSE FALSE
## internetyes FALSE FALSE
## romanticyes FALSE FALSE
## famrel FALSE FALSE
## freetime FALSE FALSE
## goout FALSE FALSE
## Dalc FALSE FALSE
## Walc FALSE FALSE
## health FALSE FALSE
## absences FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## schoolMS sexM age addressU famsizeLE3 PstatusT Medu Fedu Mjobhealth
## 1 ( 1 ) " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " "*" " " " "
## 4 ( 1 ) " " " " " " " " " " " " "*" " " " "
## 5 ( 1 ) " " " " " " " " " " " " "*" " " " "
## 6 ( 1 ) " " " " " " " " " " " " "*" " " " "
## 7 ( 1 ) "*" " " " " " " " " " " "*" " " " "
## 8 ( 1 ) "*" "*" " " " " " " " " "*" " " " "
## Mjobother Mjobservices Mjobteacher Fjobhealth Fjobother Fjobservices
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " " " "
## Fjobteacher reasonhome reasonother reasonreputation guardianmother
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " "
## guardianother traveltime studytime failures schoolsupyes famsupyes
## 1 ( 1 ) " " " " " " "*" " " " "
## 2 ( 1 ) " " " " " " "*" " " " "
## 3 ( 1 ) " " " " "*" "*" " " " "
## 4 ( 1 ) " " " " "*" "*" " " " "
## 5 ( 1 ) " " " " "*" "*" "*" " "
## 6 ( 1 ) " " " " "*" "*" "*" " "
## 7 ( 1 ) " " " " "*" "*" "*" " "
## 8 ( 1 ) " " " " "*" "*" "*" " "
## paidyes activitiesyes nurseryyes higheryes internetyes romanticyes
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " "*" " " " "
## 3 ( 1 ) " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " "*" " " " "
## 5 ( 1 ) " " " " " " "*" " " " "
## 6 ( 1 ) " " " " " " "*" " " " "
## 7 ( 1 ) " " " " " " "*" " " " "
## 8 ( 1 ) " " " " " " "*" " " " "
## famrel freetime goout Dalc Walc health absences
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " "*" " " " " " "
## 7 ( 1 ) " " " " " " " " "*" " " " "
## 8 ( 1 ) " " " " " " " " " " " " "*"
# Linear model on the eight terms marked in the size-8 row of the best-subsets
# table.
sigm23 <- lm(
  G3 ~ school + sex + Medu + studytime + failures + schoolsup + higher +
    absences,
  data = d2_clean
)
summary(object = sigm23)
##
## Call:
## lm(formula = G3 ~ school + sex + Medu + studytime + failures +
## schoolsup + higher + absences, data = d2_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5745 -1.4500 -0.1443 1.4255 5.8460
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.34023 0.40705 25.403 < 2e-16 ***
## schoolMS -1.00702 0.18797 -5.357 1.21e-07 ***
## sexM -0.77902 0.17235 -4.520 7.46e-06 ***
## Medu 0.38118 0.07735 4.928 1.08e-06 ***
## studytime 0.46232 0.10326 4.477 9.07e-06 ***
## failures -1.29892 0.15530 -8.364 4.30e-16 ***
## schoolsupyes -1.68843 0.27114 -6.227 8.97e-10 ***
## higheryes 1.51497 0.29252 5.179 3.05e-07 ***
## absences -0.08721 0.01874 -4.653 4.03e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.988 on 596 degrees of freedom
## Multiple R-squared: 0.3988, Adjusted R-squared: 0.3907
## F-statistic: 49.42 on 8 and 596 DF, p-value: < 2.2e-16
# List the terms of sigm23 whose p-value is below the cutoff, leaving the
# intercept out. The filter is applied to significant2 itself; it previously
# read `significant`, the mathematics result, so the wrong terms were printed.
# NOTE(review): the cutoff is 0.5, not the conventional 0.05 -- confirm
# intended (for sigm23 both cutoffs select the same terms).
summ2 <- summary(sigm23)
pvals <- summ2$coefficients[, "Pr(>|t|)"]
significant2 <- names(which(pvals < 0.5))
significant2 <- significant2[!significant2 %in% "(Intercept)"]
show(significant2)
## [1] "schoolMS" "sexM" "Medu" "studytime" "failures"
## [6] "schoolsupyes" "higheryes" "absences"
# 10-fold cross-validation over candidate mtry values (Portuguese data).
# For every mtry in mlist2, a forest is trained on the nine folds that are NOT
# held out and scored (mean squared error on G3) on the held-out fold; the
# per-fold errors are then averaged into mselist2.
nfolds <- 10
fold2 <- createFolds(seq_len(nrow(d2_clean)), k = nfolds, list = FALSE)
mlist2 <- c(5, 6, 7, 8, 9, 10)
msetemp2 <- rep(NA_real_, nfolds)
mselist2 <- rep(NA_real_, length(mlist2))
for (i in seq_along(mlist2)) {
  mvalue2 <- mlist2[i]
  for (j in seq_len(nfolds)) {
    held_out2 <- fold2 == j
    # Train only on rows outside fold j so the held-out rows are unseen.
    pred_rf2 <- randomForest(
      G3 ~ .,
      data = d2_clean[!held_out2, ],
      mtry = mvalue2, ntree = 2000, importance = TRUE
    )
    msetemp2[j] <- mean(
      (d2_clean$G3[held_out2] -
        predict(pred_rf2, d2_clean[held_out2, ], type = "response"))^2
    )
  }
  # Average this data set's own fold errors (msetemp2, not the mathematics
  # vector msetemp, which made every entry identical).
  mselist2[i] <- mean(msetemp2)
}
# The numbers recorded under this call in the saved output predate the fix
# above; re-run to refresh them.
head(mselist2)
## [1] 1.807417 1.807417 1.807417 1.807417 1.807417 1.807417
# Final Portuguese forest fitted on all cleaned rows (mtry = 10, 2000 trees),
# followed by its printed fit summary.
rf2 <- randomForest(
  G3 ~ .,
  data = d2_clean,
  mtry = 10,
  ntree = 2000,
  importance = TRUE
)
print(rf2)
##
## Call:
## randomForest(formula = G3 ~ ., data = d2_clean, mtry = 10, ntree = 2000, importance = TRUE)
## Type of random forest: regression
## Number of trees: 2000
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 3.695536
## % Var explained: 42.92
# Default plot method for the fitted forest.
plot(rf2)

# Actual vs. predicted grades; the predictions are made on the training rows.
# Title spelling corrected to "Portuguese" to match the other plot titles.
plot(
  d2_clean$G3,
  predict(rf2, d2_clean, type = "response"),
  xlab = "actual observation",
  ylab = "Predicted",
  main = "Random Forest Prediction on Portuguese"
)

# Importance measure of type 1 (reported as %IncMSE) for every predictor.
importance(rf2, type = 1)
## %IncMSE
## school 18.0664467
## sex 13.3445122
## age 15.2529938
## address 9.1169507
## famsize -0.6528098
## Pstatus 3.1079510
## Medu 33.2634348
## Fedu 16.3226127
## Mjob 9.5370635
## Fjob 4.6989390
## reason 11.1969598
## guardian 3.8855024
## traveltime 7.1171375
## studytime 25.9266270
## failures 87.9260692
## schoolsup 37.1837215
## famsup 4.4550086
## paid 1.9551904
## activities 4.2210242
## nursery 3.7619894
## higher 53.3972202
## internet 12.4826857
## romantic 1.2542726
## famrel 11.2345019
## freetime 5.9535404
## goout 4.7948698
## Dalc 21.8644469
## Walc 18.5075529
## health 10.1653974
## absences 9.3591688
# Eight most important predictors according to the Portuguese forest.
varImpPlot(
  rf2,
  sort = TRUE,
  n.var = 8,
  main = "Variable Importance Plot Portuguese"
)
