# Load the two student-performance data sets. Both files are
# semicolon-separated with a header row. d1 is the mathematics data used in
# the analysis below; d2 is presumably the Portuguese-course counterpart
# (inferred from the file name -- confirm).
data_dir <- "C:/Users/david/Downloads/student+performance/student"
d1 <- read.table(
  file.path(data_dir, "student-mat.csv"),
  sep = ";",
  header = TRUE
)
d2 <- read.table(
  file.path(data_dir, "student-por.csv"),
  sep = ";",
  header = TRUE
)

# View() opens a data viewer window, so only call it when someone is there to
# look at it; this keeps the script runnable under Rscript / knitting.
if (interactive()) {
  View(d1)
  View(d2)
}
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(leaps)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library(caret)
## Loading required package: lattice
# Mathematics data ----
# Drop the period grades G1 and G2 (columns 31 and 32 of d1) so the screening
# model is fitted without them.
d11 <- d1[, -c(31, 32)]

# Screening model: final grade G3 regressed on the background predictors.
# NOTE(review): `famsup` is a column of d1 but does not appear in this
# formula -- confirm whether the omission is intentional.
m1 <- lm(
  G3 ~ school + sex + age + address + famsize + Pstatus + Medu + Fedu + Mjob +
    Fjob + reason + guardian + traveltime + studytime + failures + schoolsup +
    paid + activities + nursery + higher + internet + romantic + famrel +
    freetime + goout + Dalc + Walc + health + absences,
  data = d11
)

# Flag observations whose Cook's distance exceeds three times the mean
# distance. The names of `cooksD` are the row names of d11, which are also the
# row names of d1.
cooksD <- cooks.distance(m1)
influential <- cooksD[cooksD > 3 * mean(cooksD, na.rm = TRUE)]
names_of_influential <- names(influential)

# The flagged rows themselves, kept for inspection.
df <- d1[names_of_influential, ]

# Remove exactly the flagged observations, identified by row name. Matching on
# row identity (rather than joining on every column value) cannot discard an
# unflagged row that happens to share all its values with a flagged one, and
# it produces no join message. d1_clean keeps the original row names, so
# diagnostic-plot labels refer back to rows of d1.
d1_clean <- d1[!(rownames(d1) %in% names_of_influential), ]

# Refit the same formula on the cleaned data; update() reuses m1's call with
# only `data` replaced, so the formula is written once.
m1c <- update(m1, data = d1_clean)
plot(m1c)

# Distribution of the final grade G3 in the cleaned mathematics data.
hist(
  d1_clean[["G3"]],
  xlab = "G3",
  main = "G3 Grades for Mathematics"
)

# Stepwise selection ----
# AIC-based elimination starting from the full cleaned-data model. No `scope`
# is supplied, so only term deletions are considered (backward search); the
# direction is spelled out here instead of being left to the default.
selectedMod <- step(object = m1c, direction = "backward")
## Start:  AIC=828.97
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu + 
##     Fedu + Mjob + Fjob + reason + guardian + traveltime + studytime + 
##     failures + schoolsup + paid + activities + nursery + higher + 
##     internet + romantic + famrel + freetime + goout + Dalc + 
##     Walc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - reason      3      1.82 2936.7 823.19
## - guardian    2     15.66 2950.5 826.87
## - Walc        1      0.03 2934.9 826.98
## - Pstatus     1      0.26 2935.1 827.01
## - freetime    1      0.42 2935.3 827.03
## - Fedu        1      0.62 2935.5 827.05
## - nursery     1      0.74 2935.6 827.06
## - absences    1      1.78 2936.6 827.19
## - school      1      1.79 2936.6 827.19
## - activities  1      2.42 2937.3 827.27
## - Medu        1      3.34 2938.2 827.38
## - traveltime  1      3.70 2938.5 827.42
## - famrel      1      4.18 2939.0 827.48
## - internet    1      5.26 2940.1 827.61
## - Dalc        1      5.77 2940.6 827.67
## - address     1     10.26 2945.1 828.22
## - romantic    1     13.24 2948.1 828.58
## - famsize     1     15.65 2950.5 828.87
## - paid        1     15.75 2950.6 828.88
## <none>                    2934.8 828.97
## - higher      1     31.92 2966.8 830.82
## - age         1     35.20 2970.0 831.22
## - Fjob        4     85.75 3020.6 831.23
## - health      1     41.70 2976.5 832.00
## - sex         1     61.12 2996.0 834.31
## - studytime   1     73.98 3008.8 835.84
## - goout       1     75.68 3010.5 836.04
## - Mjob        4    176.98 3111.8 841.82
## - schoolsup   1    140.07 3074.9 843.57
## - failures    1    524.63 3459.5 885.52
## 
## Step:  AIC=823.19
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu + 
##     Fedu + Mjob + Fjob + guardian + traveltime + studytime + 
##     failures + schoolsup + paid + activities + nursery + higher + 
##     internet + romantic + famrel + freetime + goout + Dalc + 
##     Walc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - guardian    2     16.29 2952.9 821.16
## - Walc        1      0.03 2936.7 821.20
## - Pstatus     1      0.24 2936.9 821.22
## - Fedu        1      0.46 2937.1 821.25
## - freetime    1      0.48 2937.1 821.25
## - nursery     1      0.78 2937.4 821.29
## - absences    1      1.74 2938.4 821.40
## - school      1      2.08 2938.7 821.45
## - activities  1      2.09 2938.7 821.45
## - Medu        1      3.45 2940.1 821.61
## - traveltime  1      3.85 2940.5 821.66
## - famrel      1      4.34 2941.0 821.72
## - internet    1      5.19 2941.8 821.82
## - Dalc        1      6.69 2943.3 822.00
## - address     1     10.05 2946.7 822.41
## - romantic    1     12.45 2949.1 822.70
## - famsize     1     15.37 2952.0 823.05
## - paid        1     15.40 2952.1 823.06
## <none>                    2936.7 823.19
## - higher      1     30.54 2967.2 824.88
## - age         1     36.11 2972.8 825.55
## - Fjob        4     90.11 3026.8 825.95
## - health      1     44.05 2980.7 826.49
## - sex         1     60.09 2996.7 828.40
## - studytime   1     74.47 3011.1 830.11
## - goout       1     77.80 3014.5 830.50
## - Mjob        4    188.51 3125.2 837.34
## - schoolsup   1    141.19 3077.8 837.91
## - failures    1    530.28 3466.9 880.29
## 
## Step:  AIC=821.16
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu + 
##     Fedu + Mjob + Fjob + traveltime + studytime + failures + 
##     schoolsup + paid + activities + nursery + higher + internet + 
##     romantic + famrel + freetime + goout + Dalc + Walc + health + 
##     absences
## 
##              Df Sum of Sq    RSS    AIC
## - Walc        1      0.10 2953.0 819.17
## - Pstatus     1      0.45 2953.4 819.22
## - Fedu        1      1.29 2954.2 819.32
## - freetime    1      1.34 2954.3 819.32
## - nursery     1      1.52 2954.5 819.35
## - school      1      1.81 2954.7 819.38
## - activities  1      2.19 2955.1 819.43
## - traveltime  1      2.28 2955.2 819.44
## - Medu        1      2.40 2955.3 819.45
## - absences    1      3.43 2956.4 819.58
## - internet    1      5.10 2958.0 819.78
## - famrel      1      5.49 2958.4 819.82
## - Dalc        1      6.87 2959.8 819.99
## - romantic    1     11.94 2964.9 820.60
## - address     1     12.92 2965.9 820.72
## - famsize     1     13.80 2966.7 820.82
## - paid        1     14.46 2967.4 820.90
## <none>                    2952.9 821.16
## - age         1     26.23 2979.2 822.31
## - higher      1     37.68 2990.6 823.68
## - Fjob        4     88.73 3041.7 823.70
## - health      1     46.83 2999.8 824.76
## - sex         1     61.73 3014.7 826.53
## - studytime   1     80.54 3033.5 828.74
## - goout       1     90.09 3043.0 829.86
## - Mjob        4    187.38 3140.3 835.07
## - schoolsup   1    139.17 3092.1 835.56
## - failures    1    521.00 3473.9 877.01
## 
## Step:  AIC=819.17
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu + 
##     Fedu + Mjob + Fjob + traveltime + studytime + failures + 
##     schoolsup + paid + activities + nursery + higher + internet + 
##     romantic + famrel + freetime + goout + Dalc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - Pstatus     1      0.46 2953.5 817.23
## - Fedu        1      1.24 2954.3 817.32
## - freetime    1      1.38 2954.4 817.34
## - nursery     1      1.46 2954.5 817.35
## - school      1      1.85 2954.9 817.40
## - activities  1      2.17 2955.2 817.44
## - traveltime  1      2.27 2955.3 817.45
## - Medu        1      2.53 2955.6 817.48
## - absences    1      3.36 2956.4 817.58
## - internet    1      5.12 2958.2 817.79
## - famrel      1      5.91 2958.9 817.89
## - Dalc        1      8.66 2961.7 818.22
## - romantic    1     11.93 2965.0 818.61
## - address     1     13.03 2966.1 818.74
## - famsize     1     13.71 2966.8 818.82
## - paid        1     14.72 2967.8 818.94
## <none>                    2953.0 819.17
## - age         1     26.34 2979.4 820.34
## - higher      1     37.72 2990.8 821.69
## - Fjob        4     92.14 3045.2 822.11
## - health      1     47.38 3000.4 822.84
## - sex         1     61.94 3015.0 824.56
## - studytime   1     82.21 3035.2 826.95
## - goout       1    106.71 3059.7 829.81
## - Mjob        4    187.59 3140.6 833.10
## - schoolsup   1    139.07 3092.1 833.56
## - failures    1    521.35 3474.4 875.05
## 
## Step:  AIC=817.23
## G3 ~ school + sex + age + address + famsize + Medu + Fedu + Mjob + 
##     Fjob + traveltime + studytime + failures + schoolsup + paid + 
##     activities + nursery + higher + internet + romantic + famrel + 
##     freetime + goout + Dalc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - Fedu        1      1.31 2954.8 815.39
## - freetime    1      1.33 2954.8 815.39
## - nursery     1      1.40 2954.9 815.40
## - school      1      1.85 2955.4 815.45
## - traveltime  1      2.26 2955.8 815.50
## - activities  1      2.40 2955.9 815.52
## - Medu        1      2.73 2956.2 815.56
## - absences    1      3.77 2957.3 815.68
## - internet    1      4.90 2958.4 815.82
## - famrel      1      5.99 2959.5 815.95
## - Dalc        1      8.78 2962.3 816.29
## - romantic    1     11.77 2965.3 816.65
## - address     1     12.99 2966.5 816.79
## - paid        1     15.02 2968.5 817.04
## - famsize     1     15.07 2968.6 817.04
## <none>                    2953.5 817.23
## - age         1     27.14 2980.6 818.49
## - higher      1     37.84 2991.3 819.76
## - Fjob        4     92.70 3046.2 820.23
## - health      1     48.01 3001.5 820.97
## - sex         1     61.73 3015.2 822.59
## - studytime   1     82.53 3036.0 825.04
## - goout       1    106.49 3060.0 827.84
## - Mjob        4    188.52 3142.0 831.26
## - schoolsup   1    138.99 3092.5 831.60
## - failures    1    521.00 3474.5 873.07
## 
## Step:  AIC=815.39
## G3 ~ school + sex + age + address + famsize + Medu + Mjob + Fjob + 
##     traveltime + studytime + failures + schoolsup + paid + activities + 
##     nursery + higher + internet + romantic + famrel + freetime + 
##     goout + Dalc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - freetime    1      1.25 2956.1 813.54
## - nursery     1      1.31 2956.1 813.55
## - school      1      1.90 2956.7 813.62
## - activities  1      2.18 2957.0 813.65
## - traveltime  1      2.54 2957.3 813.69
## - absences    1      3.60 2958.4 813.82
## - internet    1      5.39 2960.2 814.04
## - famrel      1      6.00 2960.8 814.11
## - Medu        1      6.39 2961.2 814.16
## - Dalc        1      8.57 2963.4 814.42
## - romantic    1     11.64 2966.5 814.79
## - address     1     12.77 2967.6 814.92
## - famsize     1     14.76 2969.6 815.16
## - paid        1     14.89 2969.7 815.18
## <none>                    2954.8 815.39
## - age         1     27.01 2981.8 816.63
## - higher      1     39.88 2994.7 818.16
## - health      1     47.22 3002.0 819.03
## - Fjob        4    106.85 3061.7 820.03
## - sex         1     61.64 3016.5 820.74
## - studytime   1     81.38 3036.2 823.06
## - goout       1    105.64 3060.4 825.89
## - Mjob        4    187.37 3142.2 829.28
## - schoolsup   1    138.22 3093.0 829.66
## - failures    1    536.22 3491.0 872.76
## 
## Step:  AIC=813.54
## G3 ~ school + sex + age + address + famsize + Medu + Mjob + Fjob + 
##     traveltime + studytime + failures + schoolsup + paid + activities + 
##     nursery + higher + internet + romantic + famrel + goout + 
##     Dalc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - nursery     1      1.42 2957.5 811.71
## - activities  1      2.00 2958.1 811.78
## - school      1      2.12 2958.2 811.79
## - traveltime  1      2.70 2958.8 811.86
## - absences    1      3.51 2959.6 811.96
## - internet    1      5.27 2961.3 812.17
## - Medu        1      6.00 2962.1 812.26
## - famrel      1      7.19 2963.3 812.40
## - Dalc        1      9.52 2965.6 812.68
## - romantic    1     11.29 2967.4 812.90
## - address     1     12.97 2969.0 813.10
## - famsize     1     14.57 2970.6 813.29
## - paid        1     15.63 2971.7 813.42
## <none>                    2956.1 813.54
## - age         1     28.66 2984.7 814.97
## - higher      1     40.14 2996.2 816.34
## - health      1     46.96 3003.0 817.15
## - Fjob        4    106.78 3062.9 818.17
## - sex         1     64.26 3020.3 819.20
## - studytime   1     80.50 3036.6 821.10
## - goout       1    107.01 3063.1 824.20
## - Mjob        4    186.40 3142.5 827.31
## - schoolsup   1    140.40 3096.5 828.06
## - failures    1    535.33 3491.4 870.79
## 
## Step:  AIC=811.71
## G3 ~ school + sex + age + address + famsize + Medu + Mjob + Fjob + 
##     traveltime + studytime + failures + schoolsup + paid + activities + 
##     higher + internet + romantic + famrel + goout + Dalc + health + 
##     absences
## 
##              Df Sum of Sq    RSS    AIC
## - activities  1      1.87 2959.4 809.94
## - school      1      2.40 2959.9 810.00
## - traveltime  1      2.93 2960.4 810.06
## - absences    1      3.49 2961.0 810.13
## - Medu        1      5.60 2963.1 810.38
## - internet    1      5.62 2963.1 810.39
## - famrel      1      7.00 2964.5 810.55
## - Dalc        1     10.37 2967.9 810.96
## - romantic    1     11.99 2969.5 811.15
## - address     1     12.80 2970.3 811.25
## - famsize     1     13.68 2971.2 811.35
## - paid        1     16.30 2973.8 811.67
## <none>                    2957.5 811.71
## - age         1     28.52 2986.0 813.13
## - higher      1     39.72 2997.2 814.46
## - health      1     46.94 3004.4 815.32
## - Fjob        4    106.44 3063.9 816.30
## - sex         1     64.39 3021.9 817.38
## - studytime   1     79.78 3037.3 819.19
## - goout       1    109.85 3067.3 822.69
## - Mjob        4    185.69 3143.2 825.39
## - schoolsup   1    141.68 3099.2 826.37
## - failures    1    534.39 3491.9 868.84
## 
## Step:  AIC=809.94
## G3 ~ school + sex + age + address + famsize + Medu + Mjob + Fjob + 
##     traveltime + studytime + failures + schoolsup + paid + higher + 
##     internet + romantic + famrel + goout + Dalc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - school      1      2.78 2962.1 808.27
## - traveltime  1      3.05 2962.4 808.30
## - absences    1      3.47 2962.8 808.35
## - Medu        1      5.20 2964.6 808.56
## - internet    1      5.53 2964.9 808.60
## - famrel      1      6.92 2966.3 808.77
## - Dalc        1     11.60 2971.0 809.33
## - romantic    1     12.67 2972.0 809.46
## - famsize     1     13.62 2973.0 809.57
## - address     1     14.41 2973.8 809.66
## - paid        1     15.84 2975.2 809.84
## <none>                    2959.4 809.94
## - age         1     27.49 2986.8 811.23
## - higher      1     38.49 2997.8 812.54
## - health      1     47.45 3006.8 813.60
## - Fjob        4    105.83 3065.2 814.44
## - sex         1     62.53 3021.9 815.38
## - studytime   1     78.01 3037.4 817.20
## - goout       1    113.35 3072.7 821.32
## - Mjob        4    187.25 3146.6 823.78
## - schoolsup   1    143.11 3102.5 824.75
## - failures    1    533.88 3493.2 866.98
## 
## Step:  AIC=808.27
## G3 ~ sex + age + address + famsize + Medu + Mjob + Fjob + traveltime + 
##     studytime + failures + schoolsup + paid + higher + internet + 
##     romantic + famrel + goout + Dalc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - traveltime  1      2.13 2964.3 806.53
## - absences    1      2.43 2964.6 806.56
## - internet    1      5.03 2967.2 806.87
## - Medu        1      5.06 2967.2 806.88
## - famrel      1      6.45 2968.6 807.04
## - romantic    1     12.07 2974.2 807.72
## - Dalc        1     12.16 2974.3 807.73
## - address     1     12.72 2974.8 807.79
## - famsize     1     14.19 2976.3 807.97
## - paid        1     15.76 2977.9 808.16
## <none>                    2962.1 808.27
## - age         1     24.72 2986.9 809.23
## - higher      1     39.91 3002.0 811.03
## - health      1     47.94 3010.1 811.99
## - Fjob        4    107.59 3069.7 812.97
## - sex         1     61.35 3023.5 813.57
## - studytime   1     75.49 3037.6 815.23
## - goout       1    112.53 3074.7 819.54
## - Mjob        4    185.72 3147.8 821.92
## - schoolsup   1    145.10 3107.2 823.29
## - failures    1    543.96 3506.1 866.29
## 
## Step:  AIC=806.53
## G3 ~ sex + age + address + famsize + Medu + Mjob + Fjob + studytime + 
##     failures + schoolsup + paid + higher + internet + romantic + 
##     famrel + goout + Dalc + health + absences
## 
##             Df Sum of Sq    RSS    AIC
## - absences   1      2.43 2966.7 804.82
## - internet   1      4.82 2969.1 805.10
## - Medu       1      5.34 2969.6 805.17
## - famrel     1      6.75 2971.0 805.33
## - Dalc       1     11.41 2975.7 805.89
## - romantic   1     11.86 2976.1 805.95
## - famsize    1     13.16 2977.4 806.10
## - paid       1     15.56 2979.8 806.39
## - address    1     16.61 2980.9 806.51
## <none>                   2964.3 806.53
## - age        1     25.06 2989.3 807.52
## - higher     1     40.28 3004.5 809.33
## - health     1     47.52 3011.8 810.19
## - Fjob       4    108.57 3072.8 811.33
## - sex        1     61.13 3025.4 811.79
## - studytime  1     77.04 3041.3 813.66
## - goout      1    112.23 3076.5 817.76
## - Mjob       4    187.57 3151.8 820.37
## - schoolsup  1    144.38 3108.6 821.46
## - failures   1    543.41 3507.7 864.45
## 
## Step:  AIC=804.82
## G3 ~ sex + age + address + famsize + Medu + Mjob + Fjob + studytime + 
##     failures + schoolsup + paid + higher + internet + romantic + 
##     famrel + goout + Dalc + health
## 
##             Df Sum of Sq    RSS    AIC
## - internet   1      5.72 2972.4 803.50
## - famrel     1      6.24 2972.9 803.56
## - Medu       1      6.51 2973.2 803.60
## - romantic   1     11.06 2977.8 804.14
## - Dalc       1     11.76 2978.4 804.22
## - famsize    1     13.16 2979.9 804.39
## - paid       1     15.62 2982.3 804.69
## - address    1     16.05 2982.7 804.74
## <none>                   2966.7 804.82
## - age        1     22.90 2989.6 805.55
## - higher     1     39.40 3006.1 807.51
## - health     1     46.96 3013.7 808.41
## - Fjob       4    107.09 3073.8 809.44
## - sex        1     59.15 3025.8 809.84
## - studytime  1     75.28 3042.0 811.74
## - goout      1    113.04 3079.7 816.13
## - Mjob       4    187.06 3153.8 818.59
## - schoolsup  1    142.33 3109.0 819.50
## - failures   1    541.35 3508.0 862.49
## 
## Step:  AIC=803.5
## G3 ~ sex + age + address + famsize + Medu + Mjob + Fjob + studytime + 
##     failures + schoolsup + paid + higher + romantic + famrel + 
##     goout + Dalc + health
## 
##             Df Sum of Sq    RSS    AIC
## - Medu       1      6.54 2978.9 802.28
## - famrel     1      6.67 2979.1 802.30
## - romantic   1      9.67 2982.1 802.66
## - Dalc       1     12.20 2984.6 802.96
## - famsize    1     12.84 2985.2 803.04
## - paid       1     14.27 2986.7 803.21
## <none>                   2972.4 803.50
## - address    1     19.22 2991.6 803.80
## - age        1     25.24 2997.7 804.51
## - higher     1     37.59 3010.0 805.98
## - health     1     48.85 3021.3 807.30
## - Fjob       4    105.52 3077.9 807.92
## - sex        1     61.57 3034.0 808.80
## - studytime  1     78.74 3051.1 810.81
## - goout      1    109.60 3082.0 814.39
## - Mjob       4    190.29 3162.7 817.59
## - schoolsup  1    142.16 3114.6 818.13
## - failures   1    540.94 3513.4 861.02
## 
## Step:  AIC=802.28
## G3 ~ sex + age + address + famsize + Mjob + Fjob + studytime + 
##     failures + schoolsup + paid + higher + romantic + famrel + 
##     goout + Dalc + health
## 
##             Df Sum of Sq    RSS    AIC
## - famrel     1      7.26 2986.2 801.15
## - romantic   1      7.48 2986.4 801.18
## - famsize    1     10.74 2989.7 801.57
## - paid       1     14.01 2993.0 801.95
## - Dalc       1     14.05 2993.0 801.96
## <none>                   2978.9 802.28
## - address    1     20.58 2999.5 802.74
## - age        1     27.51 3006.5 803.56
## - higher     1     38.33 3017.3 804.84
## - health     1     54.82 3033.8 806.78
## - sex        1     60.07 3039.0 807.39
## - Fjob       4    114.99 3093.9 807.77
## - studytime  1     81.40 3060.3 809.88
## - goout      1    107.60 3086.6 812.92
## - schoolsup  1    142.39 3121.3 816.91
## - Mjob       4    216.89 3195.8 819.30
## - failures   1    563.27 3542.2 861.94
## 
## Step:  AIC=801.15
## G3 ~ sex + age + address + famsize + Mjob + Fjob + studytime + 
##     failures + schoolsup + paid + higher + romantic + goout + 
##     Dalc + health
## 
##             Df Sum of Sq    RSS    AIC
## - romantic   1      8.96 2995.2 800.22
## - famsize    1     10.18 2996.4 800.36
## - Dalc       1     11.83 2998.0 800.56
## - paid       1     13.65 2999.9 800.77
## <none>                   2986.2 801.15
## - address    1     22.50 3008.7 801.82
## - age        1     24.71 3010.9 802.08
## - higher     1     39.20 3025.4 803.79
## - health     1     50.49 3036.7 805.12
## - Fjob       4    111.63 3097.8 806.22
## - sex        1     65.30 3051.5 806.85
## - studytime  1     84.88 3071.1 809.13
## - goout      1    105.90 3092.1 811.56
## - schoolsup  1    139.26 3125.5 815.38
## - Mjob       4    215.45 3201.7 817.95
## - failures   1    573.18 3559.4 861.66
## 
## Step:  AIC=800.22
## G3 ~ sex + age + address + famsize + Mjob + Fjob + studytime + 
##     failures + schoolsup + paid + higher + goout + Dalc + health
## 
##             Df Sum of Sq    RSS    AIC
## - famsize    1      9.65 3004.8 799.36
## - Dalc       1     11.90 3007.1 799.63
## - paid       1     14.32 3009.5 799.92
## <none>                   2995.2 800.22
## - address    1     21.65 3016.8 800.78
## - age        1     30.02 3025.2 801.77
## - higher     1     43.41 3038.6 803.34
## - health     1     52.76 3047.9 804.43
## - Fjob       4    109.49 3104.7 805.00
## - sex        1     69.62 3064.8 806.40
## - studytime  1     82.12 3077.3 807.85
## - goout      1    105.16 3100.3 810.50
## - schoolsup  1    136.67 3131.8 814.10
## - Mjob       4    218.00 3213.2 817.23
## - failures   1    586.40 3581.6 861.87
## 
## Step:  AIC=799.36
## G3 ~ sex + age + address + Mjob + Fjob + studytime + failures + 
##     schoolsup + paid + higher + goout + Dalc + health
## 
##             Df Sum of Sq    RSS    AIC
## - Dalc       1     14.59 3019.4 799.09
## - paid       1     15.25 3020.1 799.16
## <none>                   3004.8 799.36
## - address    1     25.08 3029.9 800.32
## - age        1     27.92 3032.7 800.65
## - higher     1     43.97 3048.8 802.53
## - Fjob       4    104.33 3109.1 803.51
## - health     1     53.97 3058.8 803.70
## - sex        1     72.06 3076.9 805.80
## - studytime  1     79.32 3084.1 806.64
## - goout      1    107.99 3112.8 809.93
## - schoolsup  1    134.74 3139.5 812.98
## - Mjob       4    225.64 3230.4 817.14
## - failures   1    591.64 3596.4 861.35
## 
## Step:  AIC=799.09
## G3 ~ sex + age + address + Mjob + Fjob + studytime + failures + 
##     schoolsup + paid + higher + goout + health
## 
##             Df Sum of Sq    RSS    AIC
## - paid       1     11.05 3030.5 798.39
## <none>                   3019.4 799.09
## - address    1     20.74 3040.1 799.52
## - age        1     24.31 3043.7 799.94
## - higher     1     45.64 3065.0 802.43
## - health     1     50.91 3070.3 803.04
## - Fjob       4    109.18 3128.6 803.73
## - studytime  1     72.56 3092.0 805.54
## - sex        1     93.35 3112.7 807.93
## - goout      1     95.67 3115.1 808.19
## - schoolsup  1    134.63 3154.0 812.62
## - Mjob       4    216.43 3235.8 815.73
## - failures   1    578.39 3597.8 859.48
## 
## Step:  AIC=798.39
## G3 ~ sex + age + address + Mjob + Fjob + studytime + failures + 
##     schoolsup + higher + goout + health
## 
##             Df Sum of Sq    RSS    AIC
## <none>                   3030.5 798.39
## - address    1     20.38 3050.8 798.77
## - age        1     25.24 3055.7 799.34
## - higher     1     41.25 3071.7 801.20
## - health     1     49.17 3079.6 802.12
## - Fjob       4    118.35 3148.8 804.03
## - studytime  1     67.84 3098.3 804.27
## - goout      1     98.06 3128.5 807.72
## - sex        1    100.81 3131.3 808.04
## - schoolsup  1    132.89 3163.3 811.67
## - Mjob       4    211.43 3241.9 814.40
## - failures   1    568.57 3599.0 857.60
# Coefficient table and fit statistics for the model chosen by step().
summary(selectedMod)
## 
## Call:
## lm(formula = G3 ~ sex + age + address + Mjob + Fjob + studytime + 
##     failures + schoolsup + higher + goout + health, data = d1_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.4922  -2.0420   0.0357   2.0262   6.7142 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.37165    2.94147   4.886 1.59e-06 ***
## sexM          1.17017    0.34898   3.353 0.000890 ***
## age          -0.24210    0.14431  -1.678 0.094334 .  
## addressU      0.60042    0.39821   1.508 0.132540    
## Mjobhealth    1.80672    0.73655   2.453 0.014674 *  
## Mjobother    -0.03889    0.52128  -0.075 0.940568    
## Mjobservices  1.64617    0.55377   2.973 0.003164 ** 
## Mjobteacher   0.06240    0.64052   0.097 0.922451    
## Fjobhealth   -0.31962    1.08923  -0.293 0.769369    
## Fjobother    -0.75014    0.77050  -0.974 0.330965    
## Fjobservices -0.14713    0.79696  -0.185 0.853637    
## Fjobteacher   1.54225    0.95766   1.610 0.108234    
## studytime     0.56950    0.20704   2.751 0.006267 ** 
## failures     -2.05304    0.25781  -7.963 2.57e-14 ***
## schoolsupyes -1.95247    0.50714  -3.850 0.000141 ***
## higheryes     1.83648    0.85619   2.145 0.032669 *  
## goout        -0.50138    0.15161  -3.307 0.001044 ** 
## health       -0.27391    0.11696  -2.342 0.019770 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.994 on 338 degrees of freedom
## Multiple R-squared:  0.3906, Adjusted R-squared:  0.3599 
## F-statistic: 12.74 on 17 and 338 DF,  p-value: < 2.2e-16
# Reduced model: the stepwise result without `address` and `Fjob`, whose
# coefficients were not significant at the 5% level in the summary above.
sigm1 <- lm(
  formula = G3 ~ sex + age + Mjob + studytime + failures + schoolsup +
    higher + goout + health,
  data = d1_clean
)
summary(sigm1)
## 
## Call:
## lm(formula = G3 ~ sex + age + Mjob + studytime + failures + schoolsup + 
##     higher + goout + health, data = d1_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.6080  -1.9673   0.0355   2.2504   7.9983 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.513624   2.756777   5.265 2.48e-07 ***
## sexM          1.101512   0.353064   3.120 0.001963 ** 
## age          -0.248582   0.143486  -1.732 0.084093 .  
## Mjobhealth    2.013814   0.725710   2.775 0.005823 ** 
## Mjobother    -0.009995   0.522717  -0.019 0.984755    
## Mjobservices  1.966212   0.551694   3.564 0.000417 ***
## Mjobteacher   0.611118   0.631999   0.967 0.334245    
## studytime     0.518637   0.208002   2.493 0.013123 *  
## failures     -2.104745   0.260452  -8.081 1.11e-14 ***
## schoolsupyes -1.771131   0.507593  -3.489 0.000547 ***
## higheryes     1.831547   0.867072   2.112 0.035380 *  
## goout        -0.479519   0.152743  -3.139 0.001840 ** 
## health       -0.289778   0.118339  -2.449 0.014837 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.042 on 343 degrees of freedom
## Multiple R-squared:  0.3616, Adjusted R-squared:  0.3393 
## F-statistic: 16.19 on 12 and 343 DF,  p-value: < 2.2e-16
# Re-run AIC-based backward elimination, now starting from the reduced model.
selected2Mod <- step(object = sigm1, direction = "backward")
## Start:  AIC=804.92
## G3 ~ sex + age + Mjob + studytime + failures + schoolsup + higher + 
##     goout + health
## 
##             Df Sum of Sq    RSS    AIC
## <none>                   3174.5 804.92
## - age        1     27.78 3202.3 806.02
## - higher     1     41.30 3215.8 807.52
## - health     1     55.50 3230.0 809.09
## - studytime  1     57.54 3232.0 809.31
## - sex        1     90.08 3264.6 812.88
## - goout      1     91.22 3265.7 813.00
## - schoolsup  1    112.68 3287.2 815.33
## - Mjob       4    281.62 3456.1 827.18
## - failures   1    604.40 3778.9 864.96
# step() dropped nothing from sigm1 (its trace lists <none> first), so this
# summary matches summary(sigm1).
summary(selected2Mod)
## 
## Call:
## lm(formula = G3 ~ sex + age + Mjob + studytime + failures + schoolsup + 
##     higher + goout + health, data = d1_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.6080  -1.9673   0.0355   2.2504   7.9983 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.513624   2.756777   5.265 2.48e-07 ***
## sexM          1.101512   0.353064   3.120 0.001963 ** 
## age          -0.248582   0.143486  -1.732 0.084093 .  
## Mjobhealth    2.013814   0.725710   2.775 0.005823 ** 
## Mjobother    -0.009995   0.522717  -0.019 0.984755    
## Mjobservices  1.966212   0.551694   3.564 0.000417 ***
## Mjobteacher   0.611118   0.631999   0.967 0.334245    
## studytime     0.518637   0.208002   2.493 0.013123 *  
## failures     -2.104745   0.260452  -8.081 1.11e-14 ***
## schoolsupyes -1.771131   0.507593  -3.489 0.000547 ***
## higheryes     1.831547   0.867072   2.112 0.035380 *  
## goout        -0.479519   0.152743  -3.139 0.001840 ** 
## health       -0.289778   0.118339  -2.449 0.014837 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.042 on 343 degrees of freedom
## Multiple R-squared:  0.3616, Adjusted R-squared:  0.3393 
## F-statistic: 16.19 on 12 and 343 DF,  p-value: < 2.2e-16
# Further reduced model: sigm1 without `age`, whose coefficient was not
# significant at the 5% level in the preceding summary.
sigm2 <- lm(
  formula = G3 ~ sex + Mjob + studytime + failures + schoolsup + higher +
    goout + health,
  data = d1_clean
)
summary(sigm2)
## 
## Call:
## lm(formula = G3 ~ sex + Mjob + studytime + failures + schoolsup + 
##     higher + goout + health, data = d1_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.6864  -2.0401   0.0763   2.1698   8.0103 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.16573    1.14406   8.886  < 2e-16 ***
## sexM          1.13657    0.35351   3.215 0.001427 ** 
## Mjobhealth    2.15868    0.72297   2.986 0.003031 ** 
## Mjobother     0.01312    0.52406   0.025 0.980041    
## Mjobservices  2.02633    0.55220   3.670 0.000282 ***
## Mjobteacher   0.65868    0.63324   1.040 0.298989    
## studytime     0.50367    0.20843   2.417 0.016188 *  
## failures     -2.18818    0.25670  -8.524 4.92e-16 ***
## schoolsupyes -1.54185    0.49146  -3.137 0.001852 ** 
## higheryes     2.08059    0.85756   2.426 0.015772 *  
## goout        -0.50902    0.15223  -3.344 0.000918 ***
## health       -0.28117    0.11858  -2.371 0.018280 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.051 on 344 degrees of freedom
## Multiple R-squared:  0.356,  Adjusted R-squared:  0.3355 
## F-statistic: 17.29 on 11 and 344 DF,  p-value: < 2.2e-16
# Best subsets selection
# Exhaustive best-subsets search (regsubsets) for G3 on the cleaned
# mathematics data. d1_clean was derived from d1 and therefore still contains
# the columns G1 and G2; they are dropped here so the search ranks the same
# candidate predictors as the linear models of this section instead of being
# dominated by G2.
# NOTE(review): the recorded output below was produced with G1/G2 still in
# the data and has to be regenerated.
allsubs_data <- d1_clean[, !names(d1_clean) %in% c("G1", "G2")]
allsubs <- regsubsets(G3 ~ ., data = allsubs_data)
summary(allsubs)
## Subset selection object
## Call: regsubsets.formula(G3 ~ ., data = d1_clean)
## 41 Variables  (and intercept)
##                  Forced in Forced out
## schoolMS             FALSE      FALSE
## sexM                 FALSE      FALSE
## age                  FALSE      FALSE
## addressU             FALSE      FALSE
## famsizeLE3           FALSE      FALSE
## PstatusT             FALSE      FALSE
## Medu                 FALSE      FALSE
## Fedu                 FALSE      FALSE
## Mjobhealth           FALSE      FALSE
## Mjobother            FALSE      FALSE
## Mjobservices         FALSE      FALSE
## Mjobteacher          FALSE      FALSE
## Fjobhealth           FALSE      FALSE
## Fjobother            FALSE      FALSE
## Fjobservices         FALSE      FALSE
## Fjobteacher          FALSE      FALSE
## reasonhome           FALSE      FALSE
## reasonother          FALSE      FALSE
## reasonreputation     FALSE      FALSE
## guardianmother       FALSE      FALSE
## guardianother        FALSE      FALSE
## traveltime           FALSE      FALSE
## studytime            FALSE      FALSE
## failures             FALSE      FALSE
## schoolsupyes         FALSE      FALSE
## famsupyes            FALSE      FALSE
## paidyes              FALSE      FALSE
## activitiesyes        FALSE      FALSE
## nurseryyes           FALSE      FALSE
## higheryes            FALSE      FALSE
## internetyes          FALSE      FALSE
## romanticyes          FALSE      FALSE
## famrel               FALSE      FALSE
## freetime             FALSE      FALSE
## goout                FALSE      FALSE
## Dalc                 FALSE      FALSE
## Walc                 FALSE      FALSE
## health               FALSE      FALSE
## absences             FALSE      FALSE
## G1                   FALSE      FALSE
## G2                   FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          schoolMS sexM age addressU famsizeLE3 PstatusT Medu Fedu Mjobhealth
## 1  ( 1 ) " "      " "  " " " "      " "        " "      " "  " "  " "       
## 2  ( 1 ) " "      " "  " " " "      " "        " "      " "  " "  " "       
## 3  ( 1 ) " "      " "  " " " "      " "        " "      " "  " "  " "       
## 4  ( 1 ) " "      " "  " " " "      " "        " "      " "  " "  " "       
## 5  ( 1 ) " "      " "  " " " "      " "        " "      " "  " "  " "       
## 6  ( 1 ) " "      " "  " " " "      " "        " "      " "  " "  " "       
## 7  ( 1 ) " "      " "  " " " "      " "        " "      " "  " "  " "       
## 8  ( 1 ) " "      " "  " " " "      " "        " "      " "  " "  " "       
##          Mjobother Mjobservices Mjobteacher Fjobhealth Fjobother Fjobservices
## 1  ( 1 ) " "       " "          " "         " "        " "       " "         
## 2  ( 1 ) " "       " "          " "         " "        " "       " "         
## 3  ( 1 ) " "       " "          " "         " "        " "       " "         
## 4  ( 1 ) " "       " "          " "         " "        " "       " "         
## 5  ( 1 ) " "       " "          " "         " "        " "       " "         
## 6  ( 1 ) " "       " "          " "         " "        " "       " "         
## 7  ( 1 ) "*"       " "          " "         " "        " "       " "         
## 8  ( 1 ) "*"       " "          " "         " "        " "       " "         
##          Fjobteacher reasonhome reasonother reasonreputation guardianmother
## 1  ( 1 ) " "         " "        " "         " "              " "           
## 2  ( 1 ) " "         " "        " "         " "              " "           
## 3  ( 1 ) " "         " "        " "         " "              " "           
## 4  ( 1 ) " "         " "        " "         " "              " "           
## 5  ( 1 ) " "         " "        " "         " "              " "           
## 6  ( 1 ) " "         " "        " "         " "              " "           
## 7  ( 1 ) " "         " "        " "         " "              " "           
## 8  ( 1 ) " "         " "        " "         " "              " "           
##          guardianother traveltime studytime failures schoolsupyes famsupyes
## 1  ( 1 ) " "           " "        " "       " "      " "          " "      
## 2  ( 1 ) " "           " "        " "       "*"      " "          " "      
## 3  ( 1 ) " "           " "        " "       "*"      " "          " "      
## 4  ( 1 ) " "           " "        " "       "*"      " "          " "      
## 5  ( 1 ) " "           " "        " "       "*"      " "          " "      
## 6  ( 1 ) " "           " "        " "       "*"      " "          " "      
## 7  ( 1 ) " "           " "        " "       "*"      " "          " "      
## 8  ( 1 ) " "           " "        " "       "*"      " "          " "      
##          paidyes activitiesyes nurseryyes higheryes internetyes romanticyes
## 1  ( 1 ) " "     " "           " "        " "       " "         " "        
## 2  ( 1 ) " "     " "           " "        " "       " "         " "        
## 3  ( 1 ) " "     " "           " "        " "       " "         " "        
## 4  ( 1 ) " "     " "           " "        " "       " "         " "        
## 5  ( 1 ) " "     " "           " "        " "       " "         "*"        
## 6  ( 1 ) " "     " "           " "        " "       " "         "*"        
## 7  ( 1 ) " "     " "           " "        " "       " "         "*"        
## 8  ( 1 ) " "     "*"           " "        " "       " "         "*"        
##          famrel freetime goout Dalc Walc health absences G1  G2 
## 1  ( 1 ) " "    " "      " "   " "  " "  " "    " "      " " "*"
## 2  ( 1 ) " "    " "      " "   " "  " "  " "    " "      " " "*"
## 3  ( 1 ) "*"    " "      " "   " "  " "  " "    " "      " " "*"
## 4  ( 1 ) "*"    " "      " "   " "  " "  " "    "*"      " " "*"
## 5  ( 1 ) "*"    " "      " "   " "  " "  " "    "*"      " " "*"
## 6  ( 1 ) "*"    " "      " "   " "  " "  " "    "*"      "*" "*"
## 7  ( 1 ) "*"    " "      " "   " "  " "  " "    "*"      "*" "*"
## 8  ( 1 ) "*"    " "      " "   " "  " "  " "    "*"      "*" "*"
# Alternative reduced mathematics model: G3 on Mjob, Fjob, studytime,
# failures, schoolsup, famsup and goout, fitted to the cleaned data.
sigm3 <- lm(
  G3 ~ Mjob + Fjob + studytime + failures + schoolsup + famsup + goout,
  data = d1_clean
)
summary(sigm3)
## 
## Call:
## lm(formula = G3 ~ Mjob + Fjob + studytime + failures + schoolsup + 
##     famsup + goout, data = d1_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0968  -1.8787   0.1611   1.8979   7.3466 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12.17157    1.03148  11.800  < 2e-16 ***
## Mjobhealth    2.50274    0.73150   3.421 0.000698 ***
## Mjobother     0.29912    0.51776   0.578 0.563832    
## Mjobservices  2.19123    0.54754   4.002  7.7e-05 ***
## Mjobteacher   0.83011    0.63198   1.313 0.189896    
## Fjobhealth   -0.05973    1.10010  -0.054 0.956728    
## Fjobother    -0.46987    0.77893  -0.603 0.546763    
## Fjobservices -0.05002    0.80831  -0.062 0.950693    
## Fjobteacher   1.84596    0.97359   1.896 0.058800 .  
## studytime     0.53893    0.20379   2.645 0.008556 ** 
## failures     -2.29565    0.25368  -9.049  < 2e-16 ***
## schoolsupyes -1.69588    0.49422  -3.431 0.000674 ***
## famsupyes    -1.01961    0.34523  -2.953 0.003360 ** 
## goout        -0.48114    0.15209  -3.163 0.001699 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.051 on 342 degrees of freedom
## Multiple R-squared:  0.3597, Adjusted R-squared:  0.3354 
## F-statistic: 14.78 on 13 and 342 DF,  p-value: < 2.2e-16
# Collect the coefficients of sigm3 that are statistically significant.
# The cut-off is the conventional 0.05 (the '*' level in summary() output);
# the former 0.5 also admitted clearly non-significant terms such as
# Mjobteacher (p = 0.19).
# NOTE(review): the recorded vector below was generated with the old 0.5
# cut-off and has to be regenerated.
sig_level <- 0.05
summ <- summary(sigm3)  # model summary
pvals <- coef(summ)[, "Pr(>|t|)"]  # p-value of every coefficient
# which() drops NA p-values, so only genuinely tested terms are kept.
significant <- names(which(pvals < sig_level))
significant <- significant[significant != "(Intercept)"]  # intercept is not a predictor
show(significant)
## [1] "Mjobhealth"   "Mjobservices" "Mjobteacher"  "Fjobteacher"  "studytime"   
## [6] "failures"     "schoolsupyes" "famsupyes"    "goout"
# Random Forest method

# 10-fold CV over the candidate mtry values in mlist.
# Each fold is held out exactly once: the forest is grown on the remaining
# nfolds - 1 folds and scored on the held-out fold, so mselist[i] is the mean
# out-of-sample MSE for mlist[i]. (Previously the forest was both trained and
# scored on the same fold, and the score was taken against column 31 of
# d1_clean, which is G1 rather than G3.)
# NOTE(review): G3 ~ . still uses G1 and G2 as predictors here - confirm that
# this is intended.
# NOTE(review): the recorded MSE values below predate this fix and have to be
# regenerated.
set.seed(2024)  # fold assignment and forests are random; fix them for reproducibility
nfolds <- 10
# Assign every row to one of nfolds folds, balanced on the response.
fold <- createFolds(d1_clean$G3, k = nfolds, list = FALSE)
mlist <- c(5, 6, 7, 8, 9, 10)
msetemp <- rep(NA, nfolds)
mselist <- rep(NA, length(mlist))
for (i in seq_along(mlist)) {
  mvalue <- mlist[i]
  for (j in seq_len(nfolds)) {
    held_out <- fold == j
    # Variable importance is not needed inside the CV loop, so it is not computed.
    pred_rf <- randomForest(
      G3 ~ .,
      data = d1_clean[!held_out, ],
      mtry = mvalue,
      ntree = 2000
    )
    fold_pred <- predict(pred_rf, d1_clean[held_out, ], type = "response")
    # Address the response by name so the score cannot hit the wrong column.
    msetemp[j] <- mean((d1_clean$G3[held_out] - fold_pred)^2)
  }
  mselist[i] <- mean(msetemp)
}
head(mselist)
## [1] 1.948078 1.886288 1.841493 1.799659 1.821110 1.807417
# Final mathematics forest: all cleaned rows, mtry fixed at 10, with variable
# importance recorded for the importance plot.
rf <- randomForest(
  G3 ~ .,
  data = d1_clean,
  mtry = 10,
  ntree = 2000,
  importance = TRUE
)

# Sanity check: the forest stores one prediction per observation.
length(rf$predicted)
## [1] 356
length(d1_clean$G3)
## [1] 356
# Observed G3 against the forest's predictions for the same rows.
plot(
  x = d1_clean$G3,
  y = predict(rf, d1_clean, type = "response"),
  xlab = "actual observation",
  ylab = "Predicted",
  main = "Random Forest Prediction on Mathematics"
)

# The eight most important variables, sorted by importance.
varImpPlot(
  rf,
  sort = TRUE,
  n.var = 8,
  main = "Variable Importance Plot Mathematics"
)

# Portuguese data
# Full linear regression model: remove columns 31 and 32 of d2, then regress
# G3 on every remaining column and print the fit summary.
d22 <- d2[, -(31:32)]
m2 <- lm(G3 ~ ., data = d22)
summary(m2)
## 
## Call:
## lm(formula = G3 ~ ., data = d22)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.8142  -1.3859   0.0094   1.5635   7.6487 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       8.68148    1.98532   4.373 1.44e-05 ***
## schoolMS         -1.20033    0.26732  -4.490 8.51e-06 ***
## sexM             -0.63306    0.25002  -2.532 0.011590 *  
## age               0.15616    0.10219   1.528 0.127000    
## addressU          0.32272    0.26181   1.233 0.218192    
## famsizeLE3        0.30253    0.24502   1.235 0.217426    
## PstatusT          0.17687    0.34669   0.510 0.610113    
## Medu              0.03528    0.15134   0.233 0.815770    
## Fedu              0.16686    0.13776   1.211 0.226295    
## Mjobhealth        0.90149    0.53751   1.677 0.094023 .  
## Mjobother         0.05042    0.30293   0.166 0.867868    
## Mjobservices      0.42055    0.37309   1.127 0.260104    
## Mjobteacher       0.51183    0.50191   1.020 0.308250    
## Fjobhealth       -0.61218    0.75234  -0.814 0.416136    
## Fjobother        -0.18438    0.45619  -0.404 0.686228    
## Fjobservices     -0.64339    0.47923  -1.343 0.179916    
## Fjobteacher       0.57968    0.67224   0.862 0.388854    
## reasonhome        0.05052    0.28491   0.177 0.859323    
## reasonother      -0.43494    0.36763  -1.183 0.237232    
## reasonreputation  0.21767    0.29800   0.730 0.465403    
## guardianmother   -0.33847    0.26516  -1.276 0.202271    
## guardianother     0.10499    0.53168   0.197 0.843529    
## traveltime        0.06249    0.15915   0.393 0.694707    
## studytime         0.40668    0.13994   2.906 0.003793 ** 
## failures         -1.41221    0.20450  -6.906 1.26e-11 ***
## schoolsupyes     -1.31116    0.36405  -3.602 0.000342 ***
## famsupyes        -0.02037    0.22829  -0.089 0.928938    
## paidyes          -0.37159    0.46142  -0.805 0.420957    
## activitiesyes     0.21915    0.22341   0.981 0.327000    
## nurseryyes       -0.21605    0.27139  -0.796 0.426291    
## higheryes         1.73300    0.38274   4.528 7.17e-06 ***
## internetyes       0.25287    0.27631   0.915 0.360465    
## romanticyes      -0.43156    0.22922  -1.883 0.060217 .  
## famrel            0.16155    0.11612   1.391 0.164640    
## freetime         -0.13777    0.11234  -1.226 0.220520    
## goout            -0.06606    0.10748  -0.615 0.539012    
## Dalc             -0.20478    0.15306  -1.338 0.181426    
## Walc             -0.08148    0.11846  -0.688 0.491824    
## health           -0.18745    0.07720  -2.428 0.015468 *  
## absences         -0.03807    0.02486  -1.531 0.126295    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.665 on 609 degrees of freedom
## Multiple R-squared:  0.3603, Adjusted R-squared:  0.3194 
## F-statistic: 8.797 on 39 and 609 DF,  p-value: < 2.2e-16
# Diagnostic plots for the full Portuguese model.
plot(m2)

# Flag the observations whose Cook's distance exceeds three times the mean
# distance and remove them from d22 to obtain d2_clean.
cooksD <- cooks.distance(m2)
influential <- cooksD[cooksD > 3 * mean(cooksD, na.rm = TRUE)]
names_of_influential <- names(influential)
df <- d22[names_of_influential, ]
# anti_join() matches on every shared column, so rows are removed by value.
# NOTE(review): a non-flagged row identical to a flagged one in all columns
# would be dropped as well - confirm that this is acceptable.
d2_clean <- anti_join(d22, df)
## Joining with `by = join_by(school, sex, age, address, famsize, Pstatus, Medu,
## Fedu, Mjob, Fjob, reason, guardian, traveltime, studytime, failures, schoolsup,
## famsup, paid, activities, nursery, higher, internet, romantic, famrel,
## freetime, goout, Dalc, Walc, health, absences, G3)`
# Refit the full Portuguese model on the cleaned data and redraw the
# diagnostic plots.
m2c <- lm(G3 ~ ., data = d2_clean)
plot(m2c)

# Split the cleaned data into the response column and the predictor columns,
# and look at the distribution of the response.
response_df2 <- d2_clean["G3"]  # Y variable (one-column data frame)
predictors_df2 <- d2_clean[, names(d2_clean) != "G3"]  # X variables
hist(d2_clean$G3, main = "G3 Grades for Portuguese", xlab = "G3")

# Stepwise selection
# step() with its default settings, starting from the full model m2c; the
# trace below shows terms being removed one at a time by AIC.
selectedMod21 <- step(m2c)
## Start:  AIC=821.02
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu + 
##     Fedu + Mjob + Fjob + reason + guardian + traveltime + studytime + 
##     failures + schoolsup + famsup + paid + activities + nursery + 
##     higher + internet + romantic + famrel + freetime + goout + 
##     Dalc + Walc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - Mjob        4    13.282 2072.4 816.91
## - guardian    2     6.425 2065.6 818.91
## - traveltime  1     0.004 2059.2 819.02
## - Pstatus     1     0.035 2059.2 819.03
## - nursery     1     0.088 2059.3 819.05
## - address     1     0.547 2059.7 819.18
## - Walc        1     1.138 2060.3 819.36
## - famsup      1     1.196 2060.4 819.37
## - Fedu        1     3.589 2062.8 820.08
## - famsize     1     3.858 2063.0 820.15
## - reason      3    18.831 2078.0 820.53
## - Dalc        1     5.160 2064.3 820.54
## <none>                    2059.2 821.02
## - goout       1     8.630 2067.8 821.55
## - internet    1     8.931 2068.1 821.64
## - Medu        1     9.333 2068.5 821.76
## - romantic    1    10.285 2069.5 822.04
## - freetime    1    11.961 2071.1 822.53
## - paid        1    13.799 2073.0 823.06
## - activities  1    14.984 2074.2 823.41
## - health      1    21.990 2081.2 825.45
## - Fjob        4    44.122 2103.3 825.85
## - famrel      1    29.186 2088.4 827.54
## - age         1    42.909 2102.1 831.50
## - sex         1    45.768 2104.9 832.32
## - school      1    50.819 2110.0 833.77
## - studytime   1    56.520 2115.7 835.40
## - absences    1    58.886 2118.1 836.08
## - higher      1    99.347 2158.5 847.53
## - schoolsup   1   122.136 2181.3 853.88
## - failures    1   214.788 2274.0 879.05
## 
## Step:  AIC=816.91
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu + 
##     Fedu + Fjob + reason + guardian + traveltime + studytime + 
##     failures + schoolsup + famsup + paid + activities + nursery + 
##     higher + internet + romantic + famrel + freetime + goout + 
##     Dalc + Walc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - traveltime  1     0.001 2072.4 814.91
## - Pstatus     1     0.044 2072.5 814.92
## - nursery     1     0.056 2072.5 814.93
## - guardian    2     7.388 2079.8 815.06
## - Walc        1     0.821 2073.3 815.15
## - address     1     0.845 2073.3 815.16
## - famsup      1     1.176 2073.6 815.26
## - Fedu        1     2.394 2074.8 815.61
## - famsize     1     3.938 2076.4 816.06
## - Dalc        1     6.670 2079.1 816.86
## <none>                    2072.4 816.91
## - goout       1     7.782 2080.2 817.18
## - reason      3    21.933 2094.4 817.28
## - romantic    1     9.528 2082.0 817.69
## - internet    1    11.255 2083.7 818.19
## - freetime    1    11.357 2083.8 818.22
## - activities  1    14.564 2087.0 819.15
## - paid        1    14.926 2087.4 819.25
## - health      1    19.635 2092.1 820.62
## - Fjob        4    42.997 2115.4 821.34
## - Medu        1    24.986 2097.4 822.16
## - famrel      1    26.158 2098.6 822.50
## - age         1    40.643 2113.1 826.66
## - sex         1    43.207 2115.7 827.40
## - school      1    52.476 2124.9 830.04
## - studytime   1    53.704 2126.2 830.39
## - absences    1    65.779 2138.2 833.82
## - higher      1   104.772 2177.2 844.75
## - schoolsup   1   128.366 2200.8 851.27
## - failures    1   210.746 2283.2 873.50
## 
## Step:  AIC=814.91
## G3 ~ school + sex + age + address + famsize + Pstatus + Medu + 
##     Fedu + Fjob + reason + guardian + studytime + failures + 
##     schoolsup + famsup + paid + activities + nursery + higher + 
##     internet + romantic + famrel + freetime + goout + Dalc + 
##     Walc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - Pstatus     1     0.044 2072.5 812.92
## - nursery     1     0.056 2072.5 812.93
## - guardian    2     7.387 2079.8 813.06
## - Walc        1     0.822 2073.3 813.15
## - address     1     0.907 2073.4 813.18
## - famsup      1     1.175 2073.6 813.26
## - Fedu        1     2.410 2074.9 813.62
## - famsize     1     3.937 2076.4 814.06
## - Dalc        1     6.676 2079.1 814.86
## <none>                    2072.4 814.91
## - goout       1     7.826 2080.3 815.19
## - reason      3    22.104 2094.6 815.33
## - romantic    1     9.527 2082.0 815.69
## - internet    1    11.376 2083.8 816.22
## - freetime    1    11.408 2083.9 816.23
## - activities  1    14.567 2087.0 817.15
## - paid        1    14.944 2087.4 817.26
## - health      1    19.708 2092.2 818.64
## - Fjob        4    43.109 2115.6 819.37
## - Medu        1    25.294 2097.8 820.25
## - famrel      1    26.173 2098.6 820.50
## - age         1    40.758 2113.2 824.69
## - sex         1    43.900 2116.3 825.59
## - school      1    53.057 2125.5 828.21
## - studytime   1    53.749 2126.2 828.40
## - absences    1    65.852 2138.3 831.84
## - higher      1   104.850 2177.3 842.77
## - schoolsup   1   128.634 2201.1 849.34
## - failures    1   211.150 2283.6 871.61
## 
## Step:  AIC=812.92
## G3 ~ school + sex + age + address + famsize + Medu + Fedu + Fjob + 
##     reason + guardian + studytime + failures + schoolsup + famsup + 
##     paid + activities + nursery + higher + internet + romantic + 
##     famrel + freetime + goout + Dalc + Walc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - nursery     1     0.056 2072.6 810.94
## - guardian    2     7.659 2080.2 811.16
## - Walc        1     0.802 2073.3 811.16
## - address     1     0.878 2073.4 811.18
## - famsup      1     1.160 2073.7 811.26
## - Fedu        1     2.390 2074.9 811.62
## - famsize     1     3.982 2076.5 812.09
## - Dalc        1     6.691 2079.2 812.87
## <none>                    2072.5 812.92
## - goout       1     7.839 2080.3 813.21
## - reason      3    22.062 2094.6 813.33
## - romantic    1     9.632 2082.1 813.73
## - freetime    1    11.375 2083.9 814.24
## - internet    1    11.544 2084.0 814.29
## - activities  1    14.950 2087.4 815.27
## - paid        1    14.959 2087.5 815.28
## - health      1    19.728 2092.2 816.66
## - Fjob        4    43.071 2115.6 817.37
## - Medu        1    25.254 2097.8 818.25
## - famrel      1    26.284 2098.8 818.55
## - age         1    41.301 2113.8 822.86
## - sex         1    43.865 2116.4 823.60
## - school      1    53.290 2125.8 826.28
## - studytime   1    53.730 2126.2 826.41
## - absences    1    67.210 2139.7 830.23
## - higher      1   105.081 2177.6 840.85
## - schoolsup   1   128.706 2201.2 847.38
## - failures    1   211.260 2283.8 869.65
## 
## Step:  AIC=810.94
## G3 ~ school + sex + age + address + famsize + Medu + Fedu + Fjob + 
##     reason + guardian + studytime + failures + schoolsup + famsup + 
##     paid + activities + higher + internet + romantic + famrel + 
##     freetime + goout + Dalc + Walc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - guardian    2     7.642 2080.2 809.17
## - Walc        1     0.817 2073.4 809.18
## - address     1     0.874 2073.4 809.20
## - famsup      1     1.148 2073.7 809.28
## - Fedu        1     2.399 2074.9 809.64
## - famsize     1     4.121 2076.7 810.14
## - Dalc        1     6.716 2079.3 810.90
## <none>                    2072.6 810.94
## - goout       1     7.805 2080.4 811.22
## - reason      3    22.055 2094.6 811.35
## - romantic    1     9.645 2082.2 811.75
## - freetime    1    11.415 2084.0 812.26
## - internet    1    11.512 2084.1 812.29
## - paid        1    14.938 2087.5 813.29
## - activities  1    15.087 2087.6 813.33
## - health      1    19.694 2092.2 814.66
## - Fjob        4    43.048 2115.6 815.38
## - Medu        1    25.726 2098.3 816.40
## - famrel      1    26.355 2098.9 816.59
## - age         1    41.579 2114.1 820.96
## - sex         1    44.013 2116.6 821.65
## - school      1    53.234 2125.8 824.28
## - studytime   1    53.725 2126.3 824.42
## - absences    1    67.308 2139.9 828.28
## - higher      1   105.032 2177.6 838.85
## - schoolsup   1   128.684 2201.2 845.39
## - failures    1   212.796 2285.3 868.07
## 
## Step:  AIC=809.17
## G3 ~ school + sex + age + address + famsize + Medu + Fedu + Fjob + 
##     reason + studytime + failures + schoolsup + famsup + paid + 
##     activities + higher + internet + romantic + famrel + freetime + 
##     goout + Dalc + Walc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - Walc        1     0.818 2081.0 807.41
## - address     1     0.946 2081.1 807.44
## - famsup      1     1.111 2081.3 807.49
## - famsize     1     3.856 2084.1 808.29
## - Fedu        1     3.934 2084.1 808.31
## - Dalc        1     6.324 2086.5 809.00
## <none>                    2080.2 809.17
## - goout       1     8.824 2089.0 809.73
## - reason      3    23.423 2103.6 809.94
## - romantic    1     9.903 2090.1 810.04
## - freetime    1    12.169 2092.4 810.70
## - internet    1    12.666 2092.9 810.84
## - activities  1    14.584 2094.8 811.39
## - paid        1    16.850 2097.1 812.05
## - health      1    19.929 2100.1 812.94
## - Fjob        4    42.341 2122.5 813.36
## - Medu        1    22.303 2102.5 813.62
## - famrel      1    26.830 2107.0 814.92
## - age         1    40.502 2120.7 818.83
## - sex         1    42.236 2122.4 819.33
## - school      1    50.790 2131.0 821.76
## - studytime   1    54.488 2134.7 822.81
## - absences    1    71.127 2151.3 827.51
## - higher      1   105.071 2185.3 836.98
## - schoolsup   1   127.954 2208.2 843.28
## - failures    1   213.834 2294.0 866.37
## 
## Step:  AIC=807.41
## G3 ~ school + sex + age + address + famsize + Medu + Fedu + Fjob + 
##     reason + studytime + failures + schoolsup + famsup + paid + 
##     activities + higher + internet + romantic + famrel + freetime + 
##     goout + Dalc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - address     1     0.993 2082.0 805.69
## - famsup      1     1.042 2082.1 805.71
## - Fedu        1     3.604 2084.6 806.45
## - famsize     1     3.661 2084.7 806.47
## <none>                    2081.0 807.41
## - reason      3    23.097 2104.1 808.08
## - romantic    1     9.726 2090.7 808.23
## - freetime    1    11.921 2092.9 808.86
## - Dalc        1    11.939 2092.9 808.87
## - goout       1    11.997 2093.0 808.88
## - internet    1    12.633 2093.7 809.07
## - activities  1    14.853 2095.9 809.71
## - paid        1    17.124 2098.1 810.36
## - health      1    21.340 2102.3 811.58
## - Medu        1    23.197 2104.2 812.11
## - Fjob        4    45.301 2126.3 812.43
## - famrel      1    28.626 2109.6 813.67
## - age         1    40.680 2121.7 817.12
## - sex         1    45.613 2126.6 818.52
## - school      1    50.281 2131.3 819.85
## - studytime   1    57.018 2138.0 821.76
## - absences    1    72.242 2153.2 826.05
## - higher      1   104.379 2185.4 835.01
## - schoolsup   1   127.137 2208.2 841.28
## - failures    1   213.748 2294.8 864.56
## 
## Step:  AIC=805.69
## G3 ~ school + sex + age + famsize + Medu + Fedu + Fjob + reason + 
##     studytime + failures + schoolsup + famsup + paid + activities + 
##     higher + internet + romantic + famrel + freetime + goout + 
##     Dalc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - famsup      1     1.140 2083.2 804.03
## - Fedu        1     3.714 2085.7 804.77
## - famsize     1     3.927 2085.9 804.83
## <none>                    2082.0 805.69
## - reason      3    23.156 2105.2 806.39
## - romantic    1     9.869 2091.9 806.56
## - freetime    1    11.882 2093.9 807.14
## - goout       1    11.908 2093.9 807.14
## - Dalc        1    12.473 2094.5 807.31
## - internet    1    13.298 2095.3 807.55
## - activities  1    14.458 2096.5 807.88
## - paid        1    17.585 2099.6 808.78
## - health      1    21.367 2103.4 809.87
## - Fjob        4    44.911 2126.9 810.61
## - Medu        1    24.065 2106.1 810.65
## - famrel      1    28.079 2110.1 811.80
## - age         1    40.925 2122.9 815.47
## - sex         1    45.681 2127.7 816.82
## - studytime   1    57.100 2139.1 820.06
## - school      1    59.466 2141.5 820.73
## - absences    1    72.019 2154.0 824.27
## - higher      1   105.133 2187.1 833.50
## - schoolsup   1   127.816 2209.8 839.74
## - failures    1   213.131 2295.1 862.66
## 
## Step:  AIC=804.03
## G3 ~ school + sex + age + famsize + Medu + Fedu + Fjob + reason + 
##     studytime + failures + schoolsup + paid + activities + higher + 
##     internet + romantic + famrel + freetime + goout + Dalc + 
##     health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - Fedu        1     3.515 2086.7 803.05
## - famsize     1     4.056 2087.2 803.20
## <none>                    2083.2 804.03
## - reason      3    22.727 2105.9 804.59
## - romantic    1     9.737 2092.9 804.85
## - goout       1    11.898 2095.0 805.47
## - freetime    1    11.936 2095.1 805.48
## - Dalc        1    12.922 2096.1 805.77
## - internet    1    12.930 2096.1 805.77
## - activities  1    14.958 2098.1 806.35
## - paid        1    18.428 2101.6 807.35
## - health      1    21.942 2105.1 808.36
## - Medu        1    23.554 2106.7 808.83
## - Fjob        4    44.784 2127.9 808.89
## - famrel      1    27.970 2111.1 810.09
## - age         1    43.007 2126.2 814.39
## - sex         1    44.556 2127.7 814.83
## - studytime   1    56.045 2139.2 818.09
## - school      1    59.976 2143.1 819.20
## - absences    1    72.734 2155.9 822.79
## - higher      1   104.729 2187.9 831.70
## - schoolsup   1   127.807 2210.9 838.05
## - failures    1   217.335 2300.5 862.07
## 
## Step:  AIC=803.05
## G3 ~ school + sex + age + famsize + Medu + Fjob + reason + studytime + 
##     failures + schoolsup + paid + activities + higher + internet + 
##     romantic + famrel + freetime + goout + Dalc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## - famsize     1     3.754 2090.4 802.13
## <none>                    2086.7 803.05
## - reason      3    22.800 2109.5 803.62
## - romantic    1    10.640 2097.3 804.12
## - goout       1    11.507 2098.2 804.37
## - freetime    1    11.660 2098.3 804.42
## - Dalc        1    13.083 2099.7 804.83
## - internet    1    13.386 2100.1 804.91
## - activities  1    14.977 2101.6 805.37
## - paid        1    17.606 2104.3 806.13
## - health      1    21.334 2108.0 807.20
## - famrel      1    28.445 2115.1 809.24
## - Fjob        4    52.259 2138.9 810.01
## - age         1    43.133 2129.8 813.42
## - sex         1    44.115 2130.8 813.70
## - Medu        1    50.688 2137.3 815.57
## - studytime   1    55.155 2141.8 816.83
## - school      1    59.666 2146.3 818.10
## - absences    1    70.718 2157.4 821.21
## - higher      1   106.831 2193.5 831.25
## - schoolsup   1   126.044 2212.7 836.53
## - failures    1   221.312 2308.0 862.03
## 
## Step:  AIC=802.13
## G3 ~ school + sex + age + Medu + Fjob + reason + studytime + 
##     failures + schoolsup + paid + activities + higher + internet + 
##     romantic + famrel + freetime + goout + Dalc + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## <none>                    2090.4 802.13
## - reason      3    23.848 2114.3 803.00
## - romantic    1    10.788 2101.2 803.25
## - goout       1    11.740 2102.2 803.52
## - freetime    1    12.052 2102.5 803.61
## - Dalc        1    12.275 2102.7 803.68
## - internet    1    13.419 2103.8 804.00
## - activities  1    14.861 2105.3 804.42
## - paid        1    18.437 2108.8 805.45
## - health      1    21.240 2111.7 806.25
## - famrel      1    28.424 2118.8 808.30
## - Fjob        4    50.891 2141.3 808.69
## - sex         1    42.049 2132.5 812.18
## - age         1    43.878 2134.3 812.70
## - Medu        1    50.504 2140.9 814.58
## - studytime   1    54.811 2145.2 815.79
## - school      1    58.181 2148.6 816.74
## - absences    1    71.868 2162.3 820.58
## - higher      1   107.015 2197.4 830.34
## - schoolsup   1   126.876 2217.3 835.78
## - failures    1   227.171 2317.6 862.55
# Coefficient table and fit statistics of the model returned by step(m2c).
summary(selectedMod21)
## 
## Call:
## lm(formula = G3 ~ school + sex + age + Medu + Fjob + reason + 
##     studytime + failures + schoolsup + paid + activities + higher + 
##     internet + romantic + famrel + freetime + goout + Dalc + 
##     health + absences, data = d2_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4000 -1.3234 -0.0888  1.3319  5.4844 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.60115    1.37637   4.796 2.06e-06 ***
## schoolMS         -0.76523    0.19063  -4.014 6.75e-05 ***
## sexM             -0.61073    0.17896  -3.413 0.000688 ***
## age               0.25184    0.07224   3.486 0.000527 ***
## Medu              0.29679    0.07935   3.740 0.000202 ***
## Fjobhealth        0.23538    0.54177   0.434 0.664114    
## Fjobother        -0.13557    0.33331  -0.407 0.684356    
## Fjobservices     -0.36848    0.35068  -1.051 0.293814    
## Fjobteacher       1.04922    0.47955   2.188 0.029073 *  
## reasonhome        0.36365    0.20747   1.753 0.080169 .  
## reasonother      -0.02469    0.27448  -0.090 0.928342    
## reasonreputation  0.47303    0.21358   2.215 0.027168 *  
## studytime         0.39417    0.10116   3.896 0.000109 ***
## failures         -1.24176    0.15654  -7.932 1.12e-14 ***
## schoolsupyes     -1.59164    0.26849  -5.928 5.27e-09 ***
## paidyes          -0.76440    0.33826  -2.260 0.024205 *  
## activitiesyes     0.33105    0.16317   2.029 0.042935 *  
## higheryes         1.55537    0.28568   5.444 7.69e-08 ***
## internetyes       0.38471    0.19955   1.928 0.054359 .  
## romanticyes      -0.28984    0.16767  -1.729 0.084411 .  
## famrel            0.23912    0.08522   2.806 0.005187 ** 
## freetime         -0.15347    0.08400  -1.827 0.068210 .  
## goout            -0.13572    0.07527  -1.803 0.071873 .  
## Dalc             -0.18284    0.09916  -1.844 0.065707 .  
## health           -0.13558    0.05590  -2.425 0.015593 *  
## absences         -0.08364    0.01875  -4.462 9.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.9 on 579 degrees of freedom
## Multiple R-squared:  0.4663, Adjusted R-squared:  0.4432 
## F-statistic: 20.23 on 25 and 579 DF,  p-value: < 2.2e-16
# Hand-specified Portuguese model fitted to the cleaned data.
# NOTE(review): compared with selectedMod21 this formula leaves out school,
# higher and Dalc - confirm that dropping school and higher is intended.
sigm21 <- lm(
  G3 ~ sex + age + Medu + reason + Fjob + studytime + failures + schoolsup +
    paid + activities + internet + romantic + famrel + freetime + goout +
    health + absences,
  data = d2_clean
)
summary(sigm21)
## 
## Call:
## lm(formula = G3 ~ sex + age + Medu + reason + Fjob + studytime + 
##     failures + schoolsup + paid + activities + internet + romantic + 
##     famrel + freetime + goout + health + absences, data = d2_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1229 -1.3318 -0.0946  1.2837  5.4003 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       7.97432    1.36361   5.848 8.30e-09 ***
## sexM             -0.69949    0.17801  -3.929 9.54e-05 ***
## age               0.17116    0.07380   2.319  0.02073 *  
## Medu              0.42002    0.08031   5.230 2.37e-07 ***
## reasonhome        0.47532    0.21437   2.217  0.02699 *  
## reasonother      -0.26681    0.28021  -0.952  0.34140    
## reasonreputation  0.57658    0.22147   2.603  0.00947 ** 
## Fjobhealth        0.52233    0.56089   0.931  0.35212    
## Fjobother         0.13661    0.34311   0.398  0.69066    
## Fjobservices     -0.16830    0.36192  -0.465  0.64210    
## Fjobteacher       1.36871    0.49519   2.764  0.00589 ** 
## studytime         0.52701    0.10344   5.095 4.73e-07 ***
## failures         -1.40982    0.16079  -8.768  < 2e-16 ***
## schoolsupyes     -1.40709    0.27621  -5.094 4.74e-07 ***
## paidyes          -0.82036    0.35157  -2.333  0.01997 *  
## activitiesyes     0.38731    0.16941   2.286  0.02260 *  
## internetyes       0.54833    0.20452   2.681  0.00755 ** 
## romanticyes      -0.37064    0.17403  -2.130  0.03360 *  
## famrel            0.27922    0.08794   3.175  0.00158 ** 
## freetime         -0.19727    0.08716  -2.263  0.02398 *  
## goout            -0.18320    0.07637  -2.399  0.01676 *  
## health           -0.10819    0.05793  -1.868  0.06231 .  
## absences         -0.08664    0.01882  -4.604 5.08e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.977 on 582 degrees of freedom
## Multiple R-squared:  0.4193, Adjusted R-squared:  0.3974 
## F-statistic:  19.1 on 22 and 582 DF,  p-value: < 2.2e-16
# AIC-based stepwise search starting from sigm21 (no scope or direction is
# supplied, so step() uses its defaults). In the recorded trace below the
# "<none>" row has the lowest AIC, i.e. no term is removed.
selectedMod22 <- stats::step(sigm21)
## Start:  AIC=847.17
## G3 ~ sex + age + Medu + reason + Fjob + studytime + failures + 
##     schoolsup + paid + activities + internet + romantic + famrel + 
##     freetime + goout + health + absences
## 
##              Df Sum of Sq    RSS    AIC
## <none>                    2274.4 847.17
## - health      1    13.632 2288.1 848.79
## - romantic    1    17.727 2292.2 849.87
## - freetime    1    20.021 2294.4 850.47
## - activities  1    20.425 2294.8 850.58
## - age         1    21.019 2295.4 850.74
## - paid        1    21.278 2295.7 850.81
## - goout       1    22.487 2296.9 851.12
## - internet    1    28.090 2302.5 852.60
## - reason      3    48.684 2323.1 853.99
## - Fjob        4    60.650 2335.1 855.09
## - famrel      1    39.394 2313.8 855.56
## - sex         1    60.342 2334.8 861.01
## - absences    1    82.852 2357.3 866.82
## - schoolsup   1   101.415 2375.8 871.56
## - studytime   1   101.431 2375.8 871.57
## - Medu        1   106.895 2381.3 872.96
## - failures    1   300.434 2574.8 920.23
# Summary of the model returned by step(); the recorded output shows the same
# formula and coefficients as sigm21.
summary(object = selectedMod22)
## 
## Call:
## lm(formula = G3 ~ sex + age + Medu + reason + Fjob + studytime + 
##     failures + schoolsup + paid + activities + internet + romantic + 
##     famrel + freetime + goout + health + absences, data = d2_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1229 -1.3318 -0.0946  1.2837  5.4003 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       7.97432    1.36361   5.848 8.30e-09 ***
## sexM             -0.69949    0.17801  -3.929 9.54e-05 ***
## age               0.17116    0.07380   2.319  0.02073 *  
## Medu              0.42002    0.08031   5.230 2.37e-07 ***
## reasonhome        0.47532    0.21437   2.217  0.02699 *  
## reasonother      -0.26681    0.28021  -0.952  0.34140    
## reasonreputation  0.57658    0.22147   2.603  0.00947 ** 
## Fjobhealth        0.52233    0.56089   0.931  0.35212    
## Fjobother         0.13661    0.34311   0.398  0.69066    
## Fjobservices     -0.16830    0.36192  -0.465  0.64210    
## Fjobteacher       1.36871    0.49519   2.764  0.00589 ** 
## studytime         0.52701    0.10344   5.095 4.73e-07 ***
## failures         -1.40982    0.16079  -8.768  < 2e-16 ***
## schoolsupyes     -1.40709    0.27621  -5.094 4.74e-07 ***
## paidyes          -0.82036    0.35157  -2.333  0.01997 *  
## activitiesyes     0.38731    0.16941   2.286  0.02260 *  
## internetyes       0.54833    0.20452   2.681  0.00755 ** 
## romanticyes      -0.37064    0.17403  -2.130  0.03360 *  
## famrel            0.27922    0.08794   3.175  0.00158 ** 
## freetime         -0.19727    0.08716  -2.263  0.02398 *  
## goout            -0.18320    0.07637  -2.399  0.01676 *  
## health           -0.10819    0.05793  -1.868  0.06231 .  
## absences         -0.08664    0.01882  -4.604 5.08e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.977 on 582 degrees of freedom
## Multiple R-squared:  0.4193, Adjusted R-squared:  0.3974 
## F-statistic:  19.1 on 22 and 582 DF,  p-value: < 2.2e-16
# Same specification as sigm21 with `health` removed (16 predictors), fitted
# to d2_clean. `health` had p of about 0.06 in the sigm21 summary.
sigm22 <- lm(
  G3 ~ sex + age + Medu + reason + Fjob + studytime + failures +
    schoolsup + paid + activities + internet + romantic + famrel +
    freetime + goout + absences,
  data = d2_clean
)
# Coefficient table and overall fit statistics for sigm22.
summary(sigm22)
## 
## Call:
## lm(formula = G3 ~ sex + age + Medu + reason + Fjob + studytime + 
##     failures + schoolsup + paid + activities + internet + romantic + 
##     famrel + freetime + goout + absences, data = d2_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2110 -1.3170 -0.0422  1.2400  5.6566 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       7.71097    1.35919   5.673 2.21e-08 ***
## sexM             -0.73985    0.17707  -4.178 3.39e-05 ***
## age               0.16868    0.07395   2.281  0.02291 *  
## Medu              0.42280    0.08047   5.254 2.09e-07 ***
## reasonhome        0.49467    0.21458   2.305  0.02150 *  
## reasonother      -0.25258    0.28071  -0.900  0.36860    
## reasonreputation  0.63113    0.22001   2.869  0.00427 ** 
## Fjobhealth        0.40472    0.55854   0.725  0.46898    
## Fjobother         0.11551    0.34366   0.336  0.73691    
## Fjobservices     -0.16329    0.36268  -0.450  0.65272    
## Fjobteacher       1.33718    0.49595   2.696  0.00722 ** 
## studytime         0.52948    0.10366   5.108 4.42e-07 ***
## failures         -1.40979    0.16113  -8.749  < 2e-16 ***
## schoolsupyes     -1.41881    0.27673  -5.127 4.01e-07 ***
## paidyes          -0.86332    0.35156  -2.456  0.01435 *  
## activitiesyes     0.38266    0.16976   2.254  0.02456 *  
## internetyes       0.55993    0.20486   2.733  0.00646 ** 
## romanticyes      -0.36747    0.17439  -2.107  0.03553 *  
## famrel            0.26294    0.08770   2.998  0.00283 ** 
## freetime         -0.20948    0.08710  -2.405  0.01648 *  
## goout            -0.17498    0.07641  -2.290  0.02237 *  
## absences         -0.08665    0.01886  -4.595 5.30e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.981 on 583 degrees of freedom
## Multiple R-squared:  0.4158, Adjusted R-squared:  0.3948 
## F-statistic: 19.76 on 21 and 583 DF,  p-value: < 2.2e-16
# Best subsets selection ----
# Search over every other column of d2_clean as a candidate predictor of G3.
# No nvmax is passed; the recorded summary reports one subset of each size
# up to 8.
allsubs2 <- regsubsets(G3 ~ ., data = d2_clean)
# Table of which terms enter the chosen subset at each size.
summary(allsubs2)
## Subset selection object
## Call: regsubsets.formula(G3 ~ ., data = d2_clean)
## 39 Variables  (and intercept)
##                  Forced in Forced out
## schoolMS             FALSE      FALSE
## sexM                 FALSE      FALSE
## age                  FALSE      FALSE
## addressU             FALSE      FALSE
## famsizeLE3           FALSE      FALSE
## PstatusT             FALSE      FALSE
## Medu                 FALSE      FALSE
## Fedu                 FALSE      FALSE
## Mjobhealth           FALSE      FALSE
## Mjobother            FALSE      FALSE
## Mjobservices         FALSE      FALSE
## Mjobteacher          FALSE      FALSE
## Fjobhealth           FALSE      FALSE
## Fjobother            FALSE      FALSE
## Fjobservices         FALSE      FALSE
## Fjobteacher          FALSE      FALSE
## reasonhome           FALSE      FALSE
## reasonother          FALSE      FALSE
## reasonreputation     FALSE      FALSE
## guardianmother       FALSE      FALSE
## guardianother        FALSE      FALSE
## traveltime           FALSE      FALSE
## studytime            FALSE      FALSE
## failures             FALSE      FALSE
## schoolsupyes         FALSE      FALSE
## famsupyes            FALSE      FALSE
## paidyes              FALSE      FALSE
## activitiesyes        FALSE      FALSE
## nurseryyes           FALSE      FALSE
## higheryes            FALSE      FALSE
## internetyes          FALSE      FALSE
## romanticyes          FALSE      FALSE
## famrel               FALSE      FALSE
## freetime             FALSE      FALSE
## goout                FALSE      FALSE
## Dalc                 FALSE      FALSE
## Walc                 FALSE      FALSE
## health               FALSE      FALSE
## absences             FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          schoolMS sexM age addressU famsizeLE3 PstatusT Medu Fedu Mjobhealth
## 1  ( 1 ) " "      " "  " " " "      " "        " "      " "  " "  " "       
## 2  ( 1 ) " "      " "  " " " "      " "        " "      " "  " "  " "       
## 3  ( 1 ) " "      " "  " " " "      " "        " "      "*"  " "  " "       
## 4  ( 1 ) " "      " "  " " " "      " "        " "      "*"  " "  " "       
## 5  ( 1 ) " "      " "  " " " "      " "        " "      "*"  " "  " "       
## 6  ( 1 ) " "      " "  " " " "      " "        " "      "*"  " "  " "       
## 7  ( 1 ) "*"      " "  " " " "      " "        " "      "*"  " "  " "       
## 8  ( 1 ) "*"      "*"  " " " "      " "        " "      "*"  " "  " "       
##          Mjobother Mjobservices Mjobteacher Fjobhealth Fjobother Fjobservices
## 1  ( 1 ) " "       " "          " "         " "        " "       " "         
## 2  ( 1 ) " "       " "          " "         " "        " "       " "         
## 3  ( 1 ) " "       " "          " "         " "        " "       " "         
## 4  ( 1 ) " "       " "          " "         " "        " "       " "         
## 5  ( 1 ) " "       " "          " "         " "        " "       " "         
## 6  ( 1 ) " "       " "          " "         " "        " "       " "         
## 7  ( 1 ) " "       " "          " "         " "        " "       " "         
## 8  ( 1 ) " "       " "          " "         " "        " "       " "         
##          Fjobteacher reasonhome reasonother reasonreputation guardianmother
## 1  ( 1 ) " "         " "        " "         " "              " "           
## 2  ( 1 ) " "         " "        " "         " "              " "           
## 3  ( 1 ) " "         " "        " "         " "              " "           
## 4  ( 1 ) " "         " "        " "         " "              " "           
## 5  ( 1 ) " "         " "        " "         " "              " "           
## 6  ( 1 ) " "         " "        " "         " "              " "           
## 7  ( 1 ) " "         " "        " "         " "              " "           
## 8  ( 1 ) " "         " "        " "         " "              " "           
##          guardianother traveltime studytime failures schoolsupyes famsupyes
## 1  ( 1 ) " "           " "        " "       "*"      " "          " "      
## 2  ( 1 ) " "           " "        " "       "*"      " "          " "      
## 3  ( 1 ) " "           " "        "*"       "*"      " "          " "      
## 4  ( 1 ) " "           " "        "*"       "*"      " "          " "      
## 5  ( 1 ) " "           " "        "*"       "*"      "*"          " "      
## 6  ( 1 ) " "           " "        "*"       "*"      "*"          " "      
## 7  ( 1 ) " "           " "        "*"       "*"      "*"          " "      
## 8  ( 1 ) " "           " "        "*"       "*"      "*"          " "      
##          paidyes activitiesyes nurseryyes higheryes internetyes romanticyes
## 1  ( 1 ) " "     " "           " "        " "       " "         " "        
## 2  ( 1 ) " "     " "           " "        "*"       " "         " "        
## 3  ( 1 ) " "     " "           " "        " "       " "         " "        
## 4  ( 1 ) " "     " "           " "        "*"       " "         " "        
## 5  ( 1 ) " "     " "           " "        "*"       " "         " "        
## 6  ( 1 ) " "     " "           " "        "*"       " "         " "        
## 7  ( 1 ) " "     " "           " "        "*"       " "         " "        
## 8  ( 1 ) " "     " "           " "        "*"       " "         " "        
##          famrel freetime goout Dalc Walc health absences
## 1  ( 1 ) " "    " "      " "   " "  " "  " "    " "     
## 2  ( 1 ) " "    " "      " "   " "  " "  " "    " "     
## 3  ( 1 ) " "    " "      " "   " "  " "  " "    " "     
## 4  ( 1 ) " "    " "      " "   " "  " "  " "    " "     
## 5  ( 1 ) " "    " "      " "   " "  " "  " "    " "     
## 6  ( 1 ) " "    " "      " "   "*"  " "  " "    " "     
## 7  ( 1 ) " "    " "      " "   " "  "*"  " "    " "     
## 8  ( 1 ) " "    " "      " "   " "  " "  " "    "*"
# Linear model built from the eight terms starred in the size-8 row of the
# best-subsets summary above (school, sex, Medu, studytime, failures,
# schoolsup, higher, absences), fitted to d2_clean.
sigm23 <- lm(
  G3 ~ school + sex + Medu + studytime + failures + schoolsup + higher +
    absences,
  data = d2_clean
)
# Coefficient table and overall fit statistics for sigm23.
summary(sigm23)
## 
## Call:
## lm(formula = G3 ~ school + sex + Medu + studytime + failures + 
##     schoolsup + higher + absences, data = d2_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5745 -1.4500 -0.1443  1.4255  5.8460 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.34023    0.40705  25.403  < 2e-16 ***
## schoolMS     -1.00702    0.18797  -5.357 1.21e-07 ***
## sexM         -0.77902    0.17235  -4.520 7.46e-06 ***
## Medu          0.38118    0.07735   4.928 1.08e-06 ***
## studytime     0.46232    0.10326   4.477 9.07e-06 ***
## failures     -1.29892    0.15530  -8.364 4.30e-16 ***
## schoolsupyes -1.68843    0.27114  -6.227 8.97e-10 ***
## higheryes     1.51497    0.29252   5.179 3.05e-07 ***
## absences     -0.08721    0.01874  -4.653 4.03e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.988 on 596 degrees of freedom
## Multiple R-squared:  0.3988, Adjusted R-squared:  0.3907 
## F-statistic: 49.42 on 8 and 596 DF,  p-value: < 2.2e-16
# Collect the names of the sigm23 coefficients that are statistically
# significant, excluding the intercept.
summ2 <- summary(sigm23)  # model summary
# p-values of the coefficient t-tests, selected by column name rather than
# by position so the lookup does not depend on the layout of the summary.
pvals <- coef(summ2)[, "Pr(>|t|)"]
# Keep terms significant at the 5% level. (A 0.5 cutoff would keep almost
# any term; for sigm23 both cutoffs select the same coefficients.)
significant2 <- names(which(pvals < 0.05))
# Drop the intercept from *this* model's vector; filtering a vector that
# belongs to another model would report terms that are not in sigm23 at all.
significant2 <- significant2[!significant2 %in% "(Intercept)"]
show(significant2)
## [1] "schoolMS"     "sexM"         "Medu"         "studytime"    "failures"    
## [6] "schoolsupyes" "higheryes"    "absences"
## 10-fold CV to compare candidate mtry values for the random forest on
## d2_clean. Result: mselist2[i] is the mean held-out MSE for mlist2[i].
nfolds <- 10
# Integer fold id (1..nfolds) for every row of d2_clean.
fold2 <- createFolds(seq_len(nrow(d2_clean)), k = nfolds, list = FALSE)
mlist2 <- c(5, 6, 7, 8, 9, 10)             # candidate mtry values
msetemp2 <- rep(NA_real_, nfolds)          # per-fold MSE for the current mtry
mselist2 <- rep(NA_real_, length(mlist2))  # mean CV MSE per candidate mtry
for (i in seq_along(mlist2)) {
  mvalue2 <- mlist2[i]
  for (j in seq_len(nfolds)) {
    held_out <- fold2 == j
    # Fit on the other nfolds - 1 folds and score on the held-out fold.
    # Fitting and scoring on the same rows would measure training error.
    # Variable importance is not needed for tuning, so it is not computed.
    pred_rf2 <- randomForest(G3 ~ ., data = d2_clean[!held_out, ],
                             mtry = mvalue2, ntree = 2000)
    fold_pred <- predict(pred_rf2, d2_clean[held_out, ], type = "response")
    # Refer to the response by name instead of by column position.
    msetemp2[j] <- mean((d2_clean$G3[held_out] - fold_pred)^2)
  }
  # Average this loop's own per-fold errors.
  mselist2[i] <- mean(msetemp2)
}
# Show the mean CV MSE per candidate mtry (mselist2 has six entries, so
# head() prints all of them).
# NOTE(review): the recorded output below is the same value for every mtry,
# which is suspicious for a tuning result; re-run to refresh it.
head(x = mselist2)
## [1] 1.807417 1.807417 1.807417 1.807417 1.807417 1.807417
# Final random forest for G3 on all of d2_clean, with mtry fixed at 10 and
# importance measures stored for the importance table and plot further down.
rf2 <- randomForest(
  G3 ~ .,
  data = d2_clean,
  mtry = 10,
  ntree = 2000,
  importance = TRUE
)
# Print the fit summary (call, number of trees, mean of squared residuals,
# % variance explained).
rf2
## 
## Call:
##  randomForest(formula = G3 ~ ., data = d2_clean, mtry = 10, ntree = 2000,      importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 2000
## No. of variables tried at each split: 10
## 
##           Mean of squared residuals: 3.695536
##                     % Var explained: 42.92
# Default plot method for the fitted forest.
plot(rf2)

# Scatter plot of actual G3 against the forest's predictions for the same
# rows of d2_clean.
# NOTE(review): predict() is given the training data, so these look like
# in-sample predictions and will flatter the fit; confirm whether
# out-of-bag predictions were intended instead.
plot(d2_clean$G3, predict(rf2, d2_clean, type = "response"),
     xlab = "actual observation", ylab = "Predicted",
     main = "Random Forest Prediction on Portuguese")

# Importance of each predictor in rf2; type = 1 is the measure printed
# below as %IncMSE.
importance(rf2, type = 1)
##               %IncMSE
## school     18.0664467
## sex        13.3445122
## age        15.2529938
## address     9.1169507
## famsize    -0.6528098
## Pstatus     3.1079510
## Medu       33.2634348
## Fedu       16.3226127
## Mjob        9.5370635
## Fjob        4.6989390
## reason     11.1969598
## guardian    3.8855024
## traveltime  7.1171375
## studytime  25.9266270
## failures   87.9260692
## schoolsup  37.1837215
## famsup      4.4550086
## paid        1.9551904
## activities  4.2210242
## nursery     3.7619894
## higher     53.3972202
## internet   12.4826857
## romantic    1.2542726
## famrel     11.2345019
## freetime    5.9535404
## goout       4.7948698
## Dalc       21.8644469
## Walc       18.5075529
## health     10.1653974
## absences    9.3591688
# Importance plot for rf2, sorted and limited to 8 variables.
varImpPlot(
  rf2,
  sort = TRUE,
  n.var = 8,
  main = "Variable Importance Plot Portuguese"
)