# Import the training data ----
library(readr)
# Directory holding the Kaggle CSV files. Set the KAGGLE_DATA_DIR environment
# variable to run on another machine; when it is unset the original local
# path is used.
data_dir <- Sys.getenv(
  "KAGGLE_DATA_DIR",
  unset = "C:/Users/chink/Google Drive/Computational Statistics/Kaggle/Jan2018"
)
train <- read_csv(file.path(data_dir, "train.csv"))
Parsed with column specification:
cols(
id = col_integer(),
spacegroup = col_integer(),
number_of_total_atoms = col_integer(),
percent_atom_al = col_double(),
percent_atom_ga = col_double(),
percent_atom_in = col_double(),
lattice_vector_1_ang = col_double(),
lattice_vector_2_ang = col_double(),
lattice_vector_3_ang = col_double(),
lattice_angle_alpha_degree = col_double(),
lattice_angle_beta_degree = col_double(),
lattice_angle_gamma_degree = col_double(),
formation_energy_ev_natom = col_double(),
bandgap_energy_ev = col_double()
)
# Import the test data ----
# Directory holding the Kaggle CSV files. Set the KAGGLE_DATA_DIR environment
# variable to run on another machine; when it is unset the original local
# path is used.
data_dir <- Sys.getenv(
  "KAGGLE_DATA_DIR",
  unset = "C:/Users/chink/Google Drive/Computational Statistics/Kaggle/Jan2018"
)
test <- read_csv(file.path(data_dir, "test.csv"))
Parsed with column specification:
cols(
id = col_integer(),
spacegroup = col_integer(),
number_of_total_atoms = col_integer(),
percent_atom_al = col_double(),
percent_atom_ga = col_double(),
percent_atom_in = col_double(),
lattice_vector_1_ang = col_double(),
lattice_vector_2_ang = col_double(),
lattice_vector_3_ang = col_double(),
lattice_angle_alpha_degree = col_double(),
lattice_angle_beta_degree = col_double(),
lattice_angle_gamma_degree = col_double()
)
# Preview the first rows to confirm the header row was parsed as column names
head(x = train)
# Compact structure of the training data: column types and leading values
str(object = train)
Classes tbl_df, tbl and 'data.frame': 2400 obs. of 14 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ spacegroup : int 33 194 227 167 194 227 206 12 206 194 ...
$ number_of_total_atoms : int 80 80 40 30 80 40 80 20 80 80 ...
$ percent_atom_al : num 0.625 0.625 0.812 0.75 0 ...
$ percent_atom_ga : num 0.375 0.375 0.188 0 0.625 ...
$ percent_atom_in : num 0 0 0 0.25 0.375 0 0.875 0.5 0.25 0 ...
$ lattice_vector_1_ang : num 9.95 6.18 9.75 5 6.66 ...
$ lattice_vector_2_ang : num 8.55 6.18 5.66 5 6.66 ...
$ lattice_vector_3_ang : num 9.18 23.63 13.96 13.53 24.58 ...
$ lattice_angle_alpha_degree: num 90 90 91 90 90 ...
$ lattice_angle_beta_degree : num 90 90 91.1 90 90 ...
$ lattice_angle_gamma_degree: num 90 120 30.5 120 120 ...
$ formation_energy_ev_natom : num 0.068 0.249 0.1821 0.2172 0.0505 ...
$ bandgap_energy_ev : num 3.44 2.92 2.74 3.35 1.38 ...
- attr(*, "spec")=List of 2
..$ cols :List of 14
.. ..$ id : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ spacegroup : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ number_of_total_atoms : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ percent_atom_al : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ percent_atom_ga : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ percent_atom_in : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ lattice_vector_1_ang : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ lattice_vector_2_ang : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ lattice_vector_3_ang : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ lattice_angle_alpha_degree: list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ lattice_angle_beta_degree : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ lattice_angle_gamma_degree: list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ formation_energy_ev_natom : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ bandgap_energy_ev : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
..$ default: list()
.. ..- attr(*, "class")= chr "collector_guess" "collector"
..- attr(*, "class")= chr "col_spec"
# Compact structure of the test data: column types and leading values
str(object = test)
Classes tbl_df, tbl and 'data.frame': 600 obs. of 12 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ spacegroup : int 33 33 167 12 12 33 167 33 33 206 ...
$ number_of_total_atoms : int 80 80 30 80 80 40 30 80 80 80 ...
$ percent_atom_al : num 0.188 0.75 0.667 0.562 0.188 ...
$ percent_atom_ga : num 0.469 0.25 0.167 0.438 0.5 ...
$ percent_atom_in : num 0.344 0 0.167 0 0.312 ...
$ lattice_vector_1_ang : num 10.54 9.89 4.98 24.34 24.64 ...
$ lattice_vector_2_ang : num 9.01 8.5 4.98 6.01 6.29 ...
$ lattice_vector_3_ang : num 9.64 9.13 13.48 5.76 6.16 ...
$ lattice_angle_alpha_degree: num 90 90 90 90 90 ...
$ lattice_angle_beta_degree : num 90 90 90 104 105 ...
$ lattice_angle_gamma_degree: num 90 90 120 90 90 ...
- attr(*, "spec")=List of 2
..$ cols :List of 12
.. ..$ id : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ spacegroup : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ number_of_total_atoms : list()
.. .. ..- attr(*, "class")= chr "collector_integer" "collector"
.. ..$ percent_atom_al : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ percent_atom_ga : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ percent_atom_in : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ lattice_vector_1_ang : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ lattice_vector_2_ang : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ lattice_vector_3_ang : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ lattice_angle_alpha_degree: list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ lattice_angle_beta_degree : list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
.. ..$ lattice_angle_gamma_degree: list()
.. .. ..- attr(*, "class")= chr "collector_double" "collector"
..$ default: list()
.. ..- attr(*, "class")= chr "collector_guess" "collector"
..- attr(*, "class")= chr "col_spec"
# Per-column summary statistics of the training data
summary(object = train)
id spacegroup number_of_total_atoms percent_atom_al percent_atom_ga
Min. : 1.0 Min. : 12.0 Min. :10.00 Min. :0.0000 Min. :0.0000
1st Qu.: 600.8 1st Qu.: 33.0 1st Qu.:40.00 1st Qu.:0.1667 1st Qu.:0.0938
Median :1200.5 Median :194.0 Median :80.00 Median :0.3750 Median :0.2812
Mean :1200.5 Mean :141.5 Mean :61.68 Mean :0.3854 Mean :0.3086
3rd Qu.:1800.2 3rd Qu.:206.0 3rd Qu.:80.00 3rd Qu.:0.5833 3rd Qu.:0.4688
Max. :2400.0 Max. :227.0 Max. :80.00 Max. :1.0000 Max. :1.0000
percent_atom_in lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
Min. :0.0000 Min. : 3.037 Min. : 2.942 Min. : 5.673
1st Qu.:0.0625 1st Qu.: 6.141 1st Qu.: 5.834 1st Qu.: 9.298
Median :0.2500 Median : 9.537 Median : 6.383 Median :10.125
Mean :0.3060 Mean :10.030 Mean : 7.087 Mean :12.593
3rd Qu.:0.4688 3rd Qu.:10.292 3rd Qu.: 9.093 3rd Qu.:14.372
Max. :1.0000 Max. :24.913 Max. :10.290 Max. :25.346
lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
Min. : 82.74 Min. : 81.64 Min. : 29.73
1st Qu.: 90.00 1st Qu.: 90.00 1st Qu.: 90.00
Median : 90.00 Median : 90.00 Median : 90.00
Mean : 90.24 Mean : 92.40 Mean : 94.79
3rd Qu.: 90.01 3rd Qu.: 90.01 3rd Qu.:120.00
Max. :101.23 Max. :106.17 Max. :120.05
formation_energy_ev_natom bandgap_energy_ev
Min. :0.0000 Min. :0.0001
1st Qu.:0.1056 1st Qu.:1.2785
Median :0.1818 Median :1.9079
Mean :0.1876 Mean :2.0772
3rd Qu.:0.2563 3rd Qu.:2.7620
Max. :0.6572 Max. :5.2861
# Per-column summary statistics of the test data
summary(object = test)
id spacegroup number_of_total_atoms percent_atom_al percent_atom_ga
Min. : 1.0 Min. : 12.0 Min. :10.00 Min. :0.0000 Min. :0.0000
1st Qu.:150.8 1st Qu.: 33.0 1st Qu.:40.00 1st Qu.:0.1250 1st Qu.:0.0938
Median :300.5 Median :194.0 Median :80.00 Median :0.3750 Median :0.2500
Mean :300.5 Mean :139.6 Mean :61.73 Mean :0.3710 Mean :0.3133
3rd Qu.:450.2 3rd Qu.:206.0 3rd Qu.:80.00 3rd Qu.:0.5625 3rd Qu.:0.4688
Max. :600.0 Max. :227.0 Max. :80.00 Max. :1.0000 Max. :0.9688
percent_atom_in lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
Min. :0.0000 Min. : 3.073 Min. : 2.960 Min. : 5.698
1st Qu.:0.0625 1st Qu.: 6.137 1st Qu.: 5.829 1st Qu.: 9.309
Median :0.2812 Median : 9.495 Median : 6.398 Median :10.097
Mean :0.3157 Mean :10.098 Mean : 7.082 Mean :12.442
3rd Qu.:0.4688 3rd Qu.:10.363 3rd Qu.: 9.157 3rd Qu.:14.328
Max. :0.9688 Max. :24.913 Max. :10.249 Max. :25.306
lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
Min. : 83.74 Min. : 82.75 Min. : 29.72
1st Qu.: 90.00 1st Qu.: 90.00 1st Qu.: 90.00
Median : 90.00 Median : 90.00 Median : 90.00
Mean : 90.16 Mean : 92.49 Mean : 96.33
3rd Qu.: 90.01 3rd Qu.: 90.01 3rd Qu.:120.00
Max. :100.95 Max. :105.98 Max. :120.05
# Missingness map of the training data (Amelia::missmap) ----
library(Amelia)
missmap(obj = train)
the condition has length > 1 and only the first element will be usedUnknown or uninitialised column: 'arguments'.Unknown or uninitialised column: 'arguments'.Unknown or uninitialised column: 'imputations'.
From the plot, we can observe that there is no missing information in the training dataset.
# Missingness map of the test data (Amelia::missmap) ----
library(Amelia)
missmap(obj = test)
the condition has length > 1 and only the first element will be usedUnknown or uninitialised column: 'arguments'.Unknown or uninitialised column: 'arguments'.Unknown or uninitialised column: 'imputations'.
No missing data.
# Histograms of the numeric predictors and of formation energy ----
library(gridExtra)
library(ggplot2)

# Fill scale shared by every histogram: bar colour encodes the bin count.
count_fill <- scale_fill_gradient("Count", low = "green", high = "blue")

p1 <- ggplot(train, aes(x = percent_atom_al)) +
  geom_histogram(col = "red", aes(fill = ..count..)) +
  count_fill +
  labs(title = "Histogram for percent_atom_al")
p2 <- ggplot(train, aes(x = percent_atom_ga)) +
  geom_histogram(col = "red", aes(fill = ..count..)) +
  count_fill +
  labs(title = "Histogram for percent_atom_ga")
p3 <- ggplot(train, aes(x = percent_atom_in)) +
  geom_histogram(col = "red", aes(fill = ..count..)) +
  count_fill +
  labs(title = "Histogram for percent_atom_in")
p4 <- ggplot(train, aes(x = lattice_vector_1_ang)) +
  geom_histogram(col = "red", aes(fill = ..count..)) +
  count_fill +
  labs(title = "Histogram for lattice_vector_1_ang")
p5 <- ggplot(train, aes(x = lattice_vector_2_ang)) +
  geom_histogram(col = "red", aes(fill = ..count..)) +
  count_fill +
  labs(title = "Histogram for lattice_vector_2_ang")
p6 <- ggplot(train, aes(x = lattice_vector_3_ang)) +
  geom_histogram(col = "red", aes(fill = ..count..)) +
  count_fill +
  labs(title = "Histogram for lattice_vector_3_ang")

# The angle histograms use explicit 1-degree bins.
p7 <- ggplot(train, aes(x = lattice_angle_alpha_degree)) +
  geom_histogram(
    col = "red", aes(fill = ..count..), breaks = seq(80, 110, by = 1)
  ) +
  count_fill +
  labs(title = "Histogram for lattice_angle_alpha_degree")
p8 <- ggplot(train, aes(x = lattice_angle_beta_degree)) +
  geom_histogram(
    col = "red", aes(fill = ..count..), breaks = seq(80, 110, by = 1)
  ) +
  count_fill +
  labs(title = "Histogram for lattice_angle_beta_degree")
# gamma spans roughly 30 to 120 degrees in the training data, so the 80-110
# window used for alpha/beta would cut off most observations; use a range
# that covers every value.
p9 <- ggplot(train, aes(x = lattice_angle_gamma_degree)) +
  geom_histogram(
    col = "red", aes(fill = ..count..), breaks = seq(25, 125, by = 1)
  ) +
  count_fill +
  labs(title = "Histogram for lattice_angle_gamma_degree")

p10 <- ggplot(train, aes(x = formation_energy_ev_natom)) +
  geom_histogram(col = "red", aes(fill = ..count..)) +
  count_fill +
  labs(title = "Histogram for formation_energy_ev_natom")

# Lay the histograms out on four pages.
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
grid.arrange(p5, p6, ncol = 2, nrow = 1)
grid.arrange(p7, p8, ncol = 2, nrow = 1)
grid.arrange(p9, p10, ncol = 2, nrow = 1)
# Treat the two discrete structural descriptors as categorical ----
train$spacegroup <- as.factor(train$spacegroup)
train$number_of_total_atoms <- as.factor(train$number_of_total_atoms)

# Pairwise composition scatterplots ----
# Coloured by space group, one panel per atom count, with a smoother overlaid.
s1 <- ggplot(
  train,
  aes(x = percent_atom_al, y = percent_atom_ga, col = spacegroup)
) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~number_of_total_atoms)
s2 <- ggplot(
  train,
  aes(x = percent_atom_al, y = percent_atom_in, col = spacegroup)
) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~number_of_total_atoms)
s3 <- ggplot(
  train,
  aes(x = percent_atom_ga, y = percent_atom_in, col = spacegroup)
) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~number_of_total_atoms)
s1
s2
s3

# Band gap against each predictor ----
# Same colouring and faceting as above, without a smoother.

# Band gap vs. gallium fraction
s4 <- ggplot(
  train,
  aes(x = percent_atom_ga, y = bandgap_energy_ev, col = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s4
# Band gap vs. aluminium fraction
s5 <- ggplot(
  train,
  aes(x = percent_atom_al, y = bandgap_energy_ev, col = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s5
# Band gap vs. indium fraction
s6 <- ggplot(
  train,
  aes(x = percent_atom_in, y = bandgap_energy_ev, col = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s6
# Band gap vs. lattice vector 1
s7 <- ggplot(
  train,
  aes(x = lattice_vector_1_ang, y = bandgap_energy_ev, col = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s7
# Band gap vs. lattice vector 2
s8 <- ggplot(
  train,
  aes(x = lattice_vector_2_ang, y = bandgap_energy_ev, col = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s8
# Band gap vs. lattice vector 3
s9 <- ggplot(
  train,
  aes(x = lattice_vector_3_ang, y = bandgap_energy_ev, col = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s9
# Band gap vs. lattice angle alpha
s10 <- ggplot(
  train,
  aes(x = lattice_angle_alpha_degree, y = bandgap_energy_ev, col = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s10
# Band gap vs. lattice angle beta
s11 <- ggplot(
  train,
  aes(x = lattice_angle_beta_degree, y = bandgap_energy_ev, col = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s11
# Band gap vs. lattice angle gamma
s12 <- ggplot(
  train,
  aes(x = lattice_angle_gamma_degree, y = bandgap_energy_ev, col = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s12
# Band gap vs. formation energy
s13 <- ggplot(
  train,
  aes(x = formation_energy_ev_natom, y = bandgap_energy_ev, col = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s13
# Training rows in space group 33, followed by their per-column summary
spacegroup33 <- train[which(train[["spacegroup"]] == "33"), ]
summary(object = spacegroup33)
id spacegroup number_of_total_atoms percent_atom_al percent_atom_ga
Min. : 1.0 12 : 0 10: 0 Min. :0.0000 Min. :0.0000
1st Qu.: 519.8 33 :432 20: 0 1st Qu.:0.1562 1st Qu.:0.0625
Median :1156.5 167: 0 30: 0 Median :0.4062 Median :0.1875
Mean :1153.0 194: 0 40:126 Mean :0.4063 Mean :0.2629
3rd Qu.:1722.5 206: 0 60: 0 3rd Qu.:0.6250 3rd Qu.:0.4062
Max. :2399.0 227: 0 80:306 Max. :0.9688 Max. :1.0000
percent_atom_in lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
Min. :0.0000 Min. : 4.903 Min. :8.414 Min. : 9.046
1st Qu.:0.0938 1st Qu.: 5.529 1st Qu.:8.651 1st Qu.: 9.277
Median :0.2500 Median :10.137 Median :8.881 Median : 9.505
Mean :0.3308 Mean : 8.889 Mean :8.916 Mean : 9.542
3rd Qu.:0.5312 3rd Qu.:10.481 3rd Qu.:9.168 3rd Qu.: 9.787
Max. :1.0000 Max. :11.284 Max. :9.640 Max. :10.264
lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
Min. :90.00 Min. :90 Min. :90
1st Qu.:90.00 1st Qu.:90 1st Qu.:90
Median :90.00 Median :90 Median :90
Mean :90.00 Mean :90 Mean :90
3rd Qu.:90.00 3rd Qu.:90 3rd Qu.:90
Max. :90.01 Max. :90 Max. :90
formation_energy_ev_natom bandgap_energy_ev
Min. :0.0254 Min. :0.6522
1st Qu.:0.1109 1st Qu.:1.3665
Median :0.1779 Median :2.0789
Mean :0.1692 Mean :2.1968
3rd Qu.:0.2308 3rd Qu.:2.9715
Max. :0.3520 Max. :4.6795
# Training rows in space group 12, followed by their per-column summary
spacegroup12 <- train[which(train[["spacegroup"]] == "12"), ]
summary(object = spacegroup12)
id spacegroup number_of_total_atoms percent_atom_al percent_atom_ga
Min. : 8.0 12 :358 10: 0 Min. :0.0000 Min. :0.000
1st Qu.: 571.2 33 : 0 20: 84 1st Qu.:0.1562 1st Qu.:0.250
Median :1089.5 167: 0 30: 0 Median :0.3438 Median :0.375
Mean :1150.7 194: 0 40: 0 Mean :0.3186 Mean :0.387
3rd Qu.:1804.8 206: 0 60: 0 3rd Qu.:0.4375 3rd Qu.:0.500
Max. :2393.0 227: 0 80:274 Max. :1.0000 Max. :1.000
percent_atom_in lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
Min. :0.0000 Min. :11.94 Min. :2.942 Min. :5.673
1st Qu.:0.1250 1st Qu.:24.09 1st Qu.:5.942 1st Qu.:5.949
Median :0.2812 Median :24.44 Median :6.212 Median :6.108
Mean :0.2944 Mean :21.64 Mean :5.517 Mean :6.114
3rd Qu.:0.4062 3rd Qu.:24.60 3rd Qu.:6.299 3rd Qu.:6.241
Max. :1.0000 Max. :24.91 Max. :6.676 Max. :6.905
lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
Min. :90 Min. :103.7 Min. :90
1st Qu.:90 1st Qu.:104.2 1st Qu.:90
Median :90 Median :104.6 Median :90
Mean :90 Mean :104.6 Mean :90
3rd Qu.:90 3rd Qu.:104.9 3rd Qu.:90
Max. :90 Max. :106.2 Max. :90
formation_energy_ev_natom bandgap_energy_ev
Min. :0.00000 Min. :0.722
1st Qu.:0.04295 1st Qu.:1.481
Median :0.12565 Median :1.857
Mean :0.12642 Mean :1.939
3rd Qu.:0.17985 3rd Qu.:2.251
Max. :0.40460 Max. :4.459
# Training rows in space group 167, followed by their per-column summary
spacegroup167 <- train[which(train[["spacegroup"]] == "167"), ]
summary(object = spacegroup167)
id spacegroup number_of_total_atoms percent_atom_al percent_atom_ga
Min. : 4.0 12 : 0 10: 0 Min. :0.0000 Min. :0.0000
1st Qu.: 701.5 33 : 0 20: 0 1st Qu.:0.2500 1st Qu.:0.0833
Median :1271.5 167:374 30:326 Median :0.4167 Median :0.3333
Mean :1262.1 194: 0 40: 0 Mean :0.4334 Mean :0.3234
3rd Qu.:1859.2 206: 0 60: 48 3rd Qu.:0.6667 3rd Qu.:0.5000
Max. :2397.0 227: 0 80: 0 Max. :0.9167 Max. :1.0000
percent_atom_in lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
Min. :0.0000 Min. : 4.833 Min. :4.833 Min. :13.17
1st Qu.:0.0000 1st Qu.: 4.957 1st Qu.:4.936 1st Qu.:13.38
Median :0.1667 Median : 5.067 Median :5.024 Median :13.57
Mean :0.2432 Mean : 5.737 Mean :5.078 Mean :13.68
3rd Qu.:0.4167 3rd Qu.: 5.278 3rd Qu.:5.193 3rd Qu.:13.93
Max. :1.0000 Max. :11.070 Max. :5.578 Max. :14.75
lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
Min. :89.98 Min. :90.00 Min. :120
1st Qu.:89.99 1st Qu.:90.00 1st Qu.:120
Median :89.99 Median :90.01 Median :120
Mean :89.99 Mean :90.01 Mean :120
3rd Qu.:90.00 3rd Qu.:90.01 3rd Qu.:120
Max. :90.00 Max. :90.02 Max. :120
formation_energy_ev_natom bandgap_energy_ev
Min. :0.01500 Min. :0.9439
1st Qu.:0.06872 1st Qu.:1.9798
Median :0.13315 Median :2.9968
Mean :0.13614 Mean :2.8962
3rd Qu.:0.19027 3rd Qu.:3.7161
Max. :0.35070 Max. :5.2861
# Training rows in space group 194, followed by their per-column summary
spacegroup194 <- train[which(train[["spacegroup"]] == "194"), ]
summary(object = spacegroup194)
id spacegroup number_of_total_atoms percent_atom_al percent_atom_ga
Min. : 2 12 : 0 10: 13 Min. :0.0000 Min. :0.0000
1st Qu.: 647 33 : 0 20: 0 1st Qu.:0.2188 1st Qu.:0.1250
Median :1302 167: 0 30: 0 Median :0.3750 Median :0.3125
Mean :1252 194:353 40: 0 Mean :0.3905 Mean :0.3249
3rd Qu.:1882 206: 0 60: 0 3rd Qu.:0.5625 3rd Qu.:0.4688
Max. :2395 227: 0 80:340 Max. :1.0000 Max. :1.0000
percent_atom_in lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
Min. :0.0000 Min. :3.037 Min. :3.037 Min. :11.67
1st Qu.:0.0625 1st Qu.:6.242 1st Qu.:6.242 1st Qu.:23.75
Median :0.2812 Median :6.462 Median :6.462 Median :24.17
Mean :0.2846 Mean :6.349 Mean :6.349 Mean :23.72
3rd Qu.:0.4375 3rd Qu.:6.632 3rd Qu.:6.631 3rd Qu.:24.48
Max. :1.0000 Max. :7.099 Max. :7.098 Max. :25.35
lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
Min. :89.99 Min. :89.98 Min. :120
1st Qu.:90.01 1st Qu.:90.00 1st Qu.:120
Median :90.01 Median :90.00 Median :120
Mean :90.01 Mean :90.00 Mean :120
3rd Qu.:90.02 3rd Qu.:90.01 3rd Qu.:120
Max. :90.02 Max. :90.01 Max. :120
formation_energy_ev_natom bandgap_energy_ev
Min. :0.0156 Min. :0.3717
1st Qu.:0.2214 1st Qu.:1.0964
Median :0.2753 Median :1.5085
Mean :0.2690 Mean :1.7607
3rd Qu.:0.3351 3rd Qu.:2.3692
Max. :0.6572 Max. :4.1785
# Training rows in space group 206, followed by their per-column summary
spacegroup206 <- train[which(train[["spacegroup"]] == "206"), ]
summary(object = spacegroup206)
id spacegroup number_of_total_atoms percent_atom_al percent_atom_ga
Min. : 7.0 12 : 0 10: 0 Min. :0.0000 Min. :0.0000
1st Qu.: 580.8 33 : 0 20: 0 1st Qu.:0.0625 1st Qu.:0.0625
Median :1136.5 167: 0 30: 0 Median :0.3750 Median :0.2812
Mean :1188.7 194: 0 40: 0 Mean :0.3314 Mean :0.2885
3rd Qu.:1798.8 206:490 60: 0 3rd Qu.:0.5000 3rd Qu.:0.4375
Max. :2400.0 227: 0 80:490 Max. :1.0000 Max. :1.0000
percent_atom_in lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
Min. :0.0000 Min. : 8.985 Min. : 8.984 Min. : 8.984
1st Qu.:0.0938 1st Qu.: 9.297 1st Qu.: 9.297 1st Qu.: 9.297
Median :0.3125 Median : 9.519 Median : 9.519 Median : 9.519
Mean :0.3801 Mean : 9.602 Mean : 9.602 Mean : 9.602
3rd Qu.:0.6562 3rd Qu.: 9.922 3rd Qu.: 9.922 3rd Qu.: 9.922
Max. :1.0000 Max. :10.290 Max. :10.290 Max. :10.291
lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
Min. :90.00 Min. :90.00 Min. :89.99
1st Qu.:90.00 1st Qu.:90.00 1st Qu.:90.00
Median :90.00 Median :90.00 Median :90.00
Mean :90.00 Mean :90.00 Mean :90.00
3rd Qu.:90.00 3rd Qu.:90.00 3rd Qu.:90.00
Max. :90.01 Max. :90.01 Max. :90.00
formation_energy_ev_natom bandgap_energy_ev
Min. :0.00000 Min. :0.7883
1st Qu.:0.09788 1st Qu.:1.2149
Median :0.13180 Median :2.0688
Mean :0.14712 Mean :2.1575
3rd Qu.:0.20450 3rd Qu.:2.9053
Max. :0.50690 Max. :5.2114
# Training rows in space group 227, followed by their per-column summary
spacegroup227 <- train[which(train[["spacegroup"]] == "227"), ]
summary(object = spacegroup227)
id spacegroup number_of_total_atoms percent_atom_al percent_atom_ga
Min. : 3 12 : 0 10: 0 Min. :0.0000 Min. :0.0000
1st Qu.: 608 33 : 0 20: 0 1st Qu.:0.1875 1st Qu.:0.1250
Median :1255 167: 0 30: 0 Median :0.4375 Median :0.2500
Mean :1208 194: 0 40:393 Mean :0.4407 Mean :0.2837
3rd Qu.:1720 206: 0 60: 0 3rd Qu.:0.6250 3rd Qu.:0.4375
Max. :2392 227:393 80: 0 Max. :0.9375 Max. :0.9375
percent_atom_in lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
Min. :0.0000 Min. : 5.686 Min. :5.625 Min. :13.62
1st Qu.:0.1250 1st Qu.: 6.089 1st Qu.:5.790 1st Qu.:14.20
Median :0.3125 Median : 9.869 Median :5.920 Median :14.46
Mean :0.2756 Mean : 8.630 Mean :5.945 Mean :14.55
3rd Qu.:0.3125 3rd Qu.:10.219 3rd Qu.:6.041 3rd Qu.:14.77
Max. :0.9375 Max. :11.140 Max. :6.545 Max. :16.03
lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
Min. : 82.74 Min. : 81.64 Min. : 29.73
1st Qu.: 90.01 1st Qu.: 90.00 1st Qu.: 30.29
Median : 90.77 Median : 90.90 Median : 30.60
Mean : 91.48 Mean : 91.36 Mean : 63.75
3rd Qu.: 91.42 3rd Qu.: 91.62 3rd Qu.:120.00
Max. :101.23 Max. :101.04 Max. :120.05
formation_energy_ev_natom bandgap_energy_ev
Min. :0.0434 Min. :0.0001
1st Qu.:0.2199 1st Qu.:0.8609
Median :0.3059 Median :1.4436
Mean :0.2900 Mean :1.4767
3rd Qu.:0.3638 3rd Qu.:2.0357
Max. :0.5369 Max. :3.6851
# Box plots of bandgap_energy_ev within each space group
ggplot(
  train,
  aes(x = factor(spacegroup), y = bandgap_energy_ev, fill = spacegroup)
) +
  geom_boxplot()
# Box plots of bandgap_energy_ev within each total-atom count
ggplot(
  train,
  aes(
    x = factor(number_of_total_atoms),
    y = bandgap_energy_ev,
    fill = number_of_total_atoms
  )
) +
  geom_boxplot()
# Number of training rows per space group, most frequent first.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
head(sort(table(train$spacegroup), decreasing = TRUE))
206 33 227 167 12 194
490 432 393 374 358 353
# Number of training rows per total-atom count, most frequent first.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
head(sort(table(train$number_of_total_atoms), decreasing = TRUE))
80 40 30 20 60 10
1410 519 326 84 48 13
# Correlation matrix of all training columns ----
library(corrplot)
# cor() needs numeric columns. spacegroup and number_of_total_atoms were
# turned into factors earlier, and as.numeric() on a factor returns the
# internal level codes (1, 2, ...) rather than the labels (12, 33, ...).
# Going through as.character() recovers the original values, so the
# correlations and any model fitted afterwards use the real labels.
train$spacegroup <- as.numeric(as.character(train$spacegroup))
train$number_of_total_atoms <-
  as.numeric(as.character(train$number_of_total_atoms))
t <- cor(train)
# "AOE" orders variables by the angular order of the eigenvectors.
corrplot(t, order = "AOE")
# Forward stepwise selection by AIC ----
# Upper scope: every column except id and formation_energy_ev_natom, with
# spacegroup and number_of_total_atoms entered as factors, not numbers.
full <- lm(
  bandgap_energy_ev ~ . -
    (spacegroup + number_of_total_atoms + id + formation_energy_ev_natom) +
    factor(spacegroup) + factor(number_of_total_atoms),
  data = train
)
# Lower scope: intercept-only model, which is also the starting point.
null <- lm(bandgap_energy_ev ~ 1, data = train)
step(
  null,
  scope = list(lower = null, upper = full),
  direction = "forward"
)
Start: AIC=33.75
bandgap_energy_ev ~ 1
Df Sum of Sq RSS AIC
+ percent_atom_in 1 1392.41 1039.5 -2004.04
+ percent_atom_al 1 1326.79 1105.2 -1857.13
+ factor(spacegroup) 5 444.14 1987.8 -440.24
+ factor(number_of_total_atoms) 5 369.82 2062.1 -352.14
+ lattice_vector_3_ang 1 59.84 2372.1 -24.05
+ lattice_vector_1_ang 1 46.92 2385.0 -11.01
+ lattice_angle_gamma_degree 1 43.20 2388.8 -7.27
+ lattice_vector_2_ang 1 39.51 2392.4 -3.57
+ lattice_angle_beta_degree 1 14.44 2417.5 21.46
+ lattice_angle_alpha_degree 1 12.72 2419.2 23.16
<none> 2432.0 33.75
+ percent_atom_ga 1 0.40 2431.6 35.35
Step: AIC=-2004.04
bandgap_energy_ev ~ percent_atom_in
Df Sum of Sq RSS AIC
+ factor(spacegroup) 5 458.27 581.28 -3389.2
+ percent_atom_ga 1 298.04 741.51 -2812.9
+ percent_atom_al 1 298.03 741.51 -2812.9
+ factor(number_of_total_atoms) 5 267.88 771.66 -2709.2
+ lattice_vector_3_ang 1 66.29 973.26 -2160.2
+ lattice_angle_gamma_degree 1 48.39 991.16 -2116.4
+ lattice_vector_2_ang 1 23.42 1016.13 -2056.7
+ lattice_vector_1_ang 1 19.08 1020.46 -2046.5
+ lattice_angle_beta_degree 1 18.66 1020.88 -2045.5
+ lattice_angle_alpha_degree 1 18.58 1020.97 -2045.3
<none> 1039.55 -2004.0
Step: AIC=-3389.18
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup)
Df Sum of Sq RSS AIC
+ percent_atom_ga 1 308.102 273.18 -5199.4
+ percent_atom_al 1 308.097 273.18 -5199.4
+ lattice_vector_2_ang 1 13.506 567.77 -3443.6
+ lattice_vector_3_ang 1 7.650 573.63 -3419.0
+ lattice_angle_beta_degree 1 4.677 576.60 -3406.6
+ lattice_angle_alpha_degree 1 4.143 577.14 -3404.4
+ lattice_angle_gamma_degree 1 2.636 578.64 -3398.1
+ lattice_vector_1_ang 1 1.967 579.31 -3395.3
+ factor(number_of_total_atoms) 4 2.645 578.63 -3392.1
<none> 581.28 -3389.2
Step: AIC=-5199.45
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga
Df Sum of Sq RSS AIC
+ lattice_angle_alpha_degree 1 6.1430 267.04 -5252.0
+ lattice_angle_beta_degree 1 4.5009 268.68 -5237.3
+ lattice_angle_gamma_degree 1 3.6643 269.51 -5229.9
+ factor(number_of_total_atoms) 4 2.8920 270.29 -5217.0
+ lattice_vector_2_ang 1 1.3857 271.79 -5209.6
+ lattice_vector_3_ang 1 0.7584 272.42 -5204.1
+ percent_atom_al 1 0.5535 272.62 -5202.3
+ lattice_vector_1_ang 1 0.2884 272.89 -5200.0
<none> 273.18 -5199.4
Step: AIC=-5252.03
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga +
lattice_angle_alpha_degree
Df Sum of Sq RSS AIC
+ factor(number_of_total_atoms) 4 2.89696 264.14 -5270.2
+ lattice_vector_2_ang 1 1.24573 265.79 -5261.3
+ lattice_angle_gamma_degree 1 1.14354 265.89 -5260.3
+ lattice_angle_beta_degree 1 0.96039 266.07 -5258.7
+ lattice_vector_3_ang 1 0.88835 266.15 -5258.0
+ lattice_vector_1_ang 1 0.69947 266.33 -5256.3
+ percent_atom_al 1 0.54848 266.49 -5255.0
<none> 267.04 -5252.0
Step: AIC=-5270.21
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga +
lattice_angle_alpha_degree + factor(number_of_total_atoms)
Df Sum of Sq RSS AIC
+ lattice_vector_1_ang 1 1.30505 262.83 -5280.1
+ lattice_angle_gamma_degree 1 1.14143 263.00 -5278.6
+ lattice_vector_3_ang 1 1.12189 263.02 -5278.4
+ lattice_angle_beta_degree 1 0.83689 263.30 -5275.8
+ percent_atom_al 1 0.59421 263.54 -5273.6
<none> 264.14 -5270.2
+ lattice_vector_2_ang 1 0.01256 264.12 -5268.3
Step: AIC=-5280.1
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga +
lattice_angle_alpha_degree + factor(number_of_total_atoms) +
lattice_vector_1_ang
Df Sum of Sq RSS AIC
+ lattice_vector_3_ang 1 1.51080 261.32 -5291.9
+ lattice_angle_beta_degree 1 1.10208 261.73 -5288.2
+ percent_atom_al 1 0.57962 262.25 -5283.4
<none> 262.83 -5280.1
+ lattice_angle_gamma_degree 1 0.21830 262.61 -5280.1
+ lattice_vector_2_ang 1 0.00056 262.83 -5278.1
Step: AIC=-5291.93
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga +
lattice_angle_alpha_degree + factor(number_of_total_atoms) +
lattice_vector_1_ang + lattice_vector_3_ang
Df Sum of Sq RSS AIC
+ percent_atom_al 1 0.61941 260.70 -5295.6
+ lattice_angle_beta_degree 1 0.55210 260.77 -5295.0
+ lattice_angle_gamma_degree 1 0.26185 261.06 -5292.3
<none> 261.32 -5291.9
+ lattice_vector_2_ang 1 0.08657 261.24 -5290.7
Step: AIC=-5295.63
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga +
lattice_angle_alpha_degree + factor(number_of_total_atoms) +
lattice_vector_1_ang + lattice_vector_3_ang + percent_atom_al
Df Sum of Sq RSS AIC
+ lattice_angle_beta_degree 1 0.52303 260.18 -5298.4
+ lattice_angle_gamma_degree 1 0.22581 260.48 -5295.7
<none> 260.70 -5295.6
+ lattice_vector_2_ang 1 0.07040 260.63 -5294.3
Step: AIC=-5298.45
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga +
lattice_angle_alpha_degree + factor(number_of_total_atoms) +
lattice_vector_1_ang + lattice_vector_3_ang + percent_atom_al +
lattice_angle_beta_degree
Df Sum of Sq RSS AIC
<none> 260.18 -5298.4
+ lattice_angle_gamma_degree 1 0.017998 260.16 -5296.6
+ lattice_vector_2_ang 1 0.000141 260.18 -5296.4
Call:
lm(formula = bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) +
percent_atom_ga + lattice_angle_alpha_degree + factor(number_of_total_atoms) +
lattice_vector_1_ang + lattice_vector_3_ang + percent_atom_al +
lattice_angle_beta_degree, data = train)
Coefficients:
(Intercept) percent_atom_in
384.16390 -385.76935
factor(spacegroup)2 factor(spacegroup)3
-0.67300 -4.04435
factor(spacegroup)4 factor(spacegroup)5
-4.09774 -0.41638
factor(spacegroup)6 percent_atom_ga
-2.89027 -383.65799
lattice_angle_alpha_degree factor(number_of_total_atoms)2
0.07393 -2.45398
factor(number_of_total_atoms)3 factor(number_of_total_atoms)4
0.22031 -2.66489
factor(number_of_total_atoms)5 factor(number_of_total_atoms)6
NA -2.97470
lattice_vector_1_ang lattice_vector_3_ang
0.03630 0.21098
percent_atom_al lattice_angle_beta_degree
-381.79333 -0.04223
The model with the lowest AIC is lm(formula = bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga + lattice_angle_alpha_degree + factor(number_of_total_atoms) + lattice_vector_1_ang + lattice_vector_3_ang + percent_atom_al + lattice_angle_beta_degree, data = train)
# Linear regression with the terms chosen by the stepwise search ----
# Store the two discrete descriptors as factors again before fitting.
train$spacegroup <- as.factor(train$spacegroup)
train$number_of_total_atoms <- as.factor(train$number_of_total_atoms)
m1 <- lm(
  formula = bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) +
    percent_atom_ga + lattice_angle_alpha_degree +
    factor(number_of_total_atoms) + lattice_vector_1_ang +
    lattice_vector_3_ang + percent_atom_al + lattice_angle_beta_degree,
  data = train
)
# Coefficient table, residual quantiles and fit statistics
summary(m1)
Call:
lm(formula = bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) +
percent_atom_ga + lattice_angle_alpha_degree + factor(number_of_total_atoms) +
lattice_vector_1_ang + lattice_vector_3_ang + percent_atom_al +
lattice_angle_beta_degree, data = train)
Residuals:
Min 1Q Median 3Q Max
-1.39702 -0.20506 -0.02209 0.20947 1.34884
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 384.16390 164.20613 2.340 0.019391 *
percent_atom_in -385.76935 164.19467 -2.349 0.018882 *
factor(spacegroup)2 -0.67300 0.32377 -2.079 0.037757 *
factor(spacegroup)3 -4.04435 1.32498 -3.052 0.002295 **
factor(spacegroup)4 -4.09774 1.20482 -3.401 0.000682 ***
factor(spacegroup)5 -0.41637 0.32435 -1.284 0.199366
factor(spacegroup)6 -2.89026 0.57916 -4.990 6.46e-07 ***
percent_atom_ga -383.65799 164.19104 -2.337 0.019540 *
lattice_angle_alpha_degree 0.07393 0.02189 3.377 0.000744 ***
factor(number_of_total_atoms)2 -2.45398 0.84491 -2.904 0.003713 **
factor(number_of_total_atoms)3 0.22031 0.06858 3.213 0.001333 **
factor(number_of_total_atoms)4 -2.66489 0.84552 -3.152 0.001643 **
factor(number_of_total_atoms)5 NA NA NA NA
factor(number_of_total_atoms)6 -2.97470 0.85188 -3.492 0.000488 ***
lattice_vector_1_ang 0.03630 0.00889 4.083 4.59e-05 ***
lattice_vector_3_ang 0.21098 0.06967 3.028 0.002488 **
percent_atom_al -381.79332 164.19206 -2.325 0.020140 *
lattice_angle_beta_degree -0.04224 0.01930 -2.189 0.028715 *
---
Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
Residual standard error: 0.3304 on 2383 degrees of freedom
Multiple R-squared: 0.893, Adjusted R-squared: 0.8923
F-statistic: 1243 on 16 and 2383 DF, p-value: < 2.2e-16
# Model diagnostics: residuals of m1 plotted against observation index
r <- resid(m1)
plot(r, col = "green")
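The residuals are scattered around zero with no obvious pattern. Note that this index plot alone cannot show whether the errors are normally distributed; that requires a Q-Q plot or a formal normality test.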
# Shapiro-Wilk test of the residuals (null hypothesis: normally distributed)
shapiro.test(x = r)
Shapiro-Wilk normality test
data: r
W = 0.98739, p-value = 9.859e-14
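Although W = 0.987 is close to 1, the p-value (9.859e-14) is far below 0.05, so the Shapiro-Wilk test rejects the null hypothesis that the residuals are normally distributed. With 2400 observations the test is sensitive to even small departures from normality, so a Q-Q plot of the residuals should be inspected to judge whether the deviation matters in practice.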
# Score the test set with m1 ----
# Drop the id column by name. A negative index in the row position
# (`test[-1, ]`) would remove the first observation instead of the column.
test <- test[, names(test) != "id"]
test$spacegroup <- as.factor(test$spacegroup)
test$number_of_total_atoms <- as.factor(test$number_of_total_atoms)
# predict() checks these factor levels against the levels m1 was fitted with
# (m1$xlevels) and stops if the test data contain labels the model never saw.
p <- predict(m1, newdata = test)
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) :
factor factor(spacegroup) has new levels 12, 33, 167, 194, 206, 227