# Load the training data ----
library(readr)
train <- read_csv(
  file = "C:/Users/chink/Google Drive/Computational Statistics/Kaggle/Jan2018/train.csv"
)
Parsed with column specification:
cols(
  id = col_integer(),
  spacegroup = col_integer(),
  number_of_total_atoms = col_integer(),
  percent_atom_al = col_double(),
  percent_atom_ga = col_double(),
  percent_atom_in = col_double(),
  lattice_vector_1_ang = col_double(),
  lattice_vector_2_ang = col_double(),
  lattice_vector_3_ang = col_double(),
  lattice_angle_alpha_degree = col_double(),
  lattice_angle_beta_degree = col_double(),
  lattice_angle_gamma_degree = col_double(),
  formation_energy_ev_natom = col_double(),
  bandgap_energy_ev = col_double()
)
# Load the test data
test <- read_csv(
  file = "C:/Users/chink/Google Drive/Computational Statistics/Kaggle/Jan2018/test.csv"
)
Parsed with column specification:
cols(
  id = col_integer(),
  spacegroup = col_integer(),
  number_of_total_atoms = col_integer(),
  percent_atom_al = col_double(),
  percent_atom_ga = col_double(),
  percent_atom_in = col_double(),
  lattice_vector_1_ang = col_double(),
  lattice_vector_2_ang = col_double(),
  lattice_vector_3_ang = col_double(),
  lattice_angle_alpha_degree = col_double(),
  lattice_angle_beta_degree = col_double(),
  lattice_angle_gamma_degree = col_double()
)
# Preview the first rows of the training data
head(x = train)
# Show the column types and dimensions of the training data
str(object = train)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   2400 obs. of  14 variables:
 $ id                        : int  1 2 3 4 5 6 7 8 9 10 ...
 $ spacegroup                : int  33 194 227 167 194 227 206 12 206 194 ...
 $ number_of_total_atoms     : int  80 80 40 30 80 40 80 20 80 80 ...
 $ percent_atom_al           : num  0.625 0.625 0.812 0.75 0 ...
 $ percent_atom_ga           : num  0.375 0.375 0.188 0 0.625 ...
 $ percent_atom_in           : num  0 0 0 0.25 0.375 0 0.875 0.5 0.25 0 ...
 $ lattice_vector_1_ang      : num  9.95 6.18 9.75 5 6.66 ...
 $ lattice_vector_2_ang      : num  8.55 6.18 5.66 5 6.66 ...
 $ lattice_vector_3_ang      : num  9.18 23.63 13.96 13.53 24.58 ...
 $ lattice_angle_alpha_degree: num  90 90 91 90 90 ...
 $ lattice_angle_beta_degree : num  90 90 91.1 90 90 ...
 $ lattice_angle_gamma_degree: num  90 120 30.5 120 120 ...
 $ formation_energy_ev_natom : num  0.068 0.249 0.1821 0.2172 0.0505 ...
 $ bandgap_energy_ev         : num  3.44 2.92 2.74 3.35 1.38 ...
 - attr(*, "spec")=List of 2
  ..$ cols   :List of 14
  .. ..$ id                        : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ spacegroup                : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ number_of_total_atoms     : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ percent_atom_al           : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ percent_atom_ga           : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ percent_atom_in           : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ lattice_vector_1_ang      : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ lattice_vector_2_ang      : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ lattice_vector_3_ang      : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ lattice_angle_alpha_degree: list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ lattice_angle_beta_degree : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ lattice_angle_gamma_degree: list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ formation_energy_ev_natom : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ bandgap_energy_ev         : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  ..$ default: list()
  .. ..- attr(*, "class")= chr  "collector_guess" "collector"
  ..- attr(*, "class")= chr "col_spec"
# Show the column types and dimensions of the test data
str(object = test)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   600 obs. of  12 variables:
 $ id                        : int  1 2 3 4 5 6 7 8 9 10 ...
 $ spacegroup                : int  33 33 167 12 12 33 167 33 33 206 ...
 $ number_of_total_atoms     : int  80 80 30 80 80 40 30 80 80 80 ...
 $ percent_atom_al           : num  0.188 0.75 0.667 0.562 0.188 ...
 $ percent_atom_ga           : num  0.469 0.25 0.167 0.438 0.5 ...
 $ percent_atom_in           : num  0.344 0 0.167 0 0.312 ...
 $ lattice_vector_1_ang      : num  10.54 9.89 4.98 24.34 24.64 ...
 $ lattice_vector_2_ang      : num  9.01 8.5 4.98 6.01 6.29 ...
 $ lattice_vector_3_ang      : num  9.64 9.13 13.48 5.76 6.16 ...
 $ lattice_angle_alpha_degree: num  90 90 90 90 90 ...
 $ lattice_angle_beta_degree : num  90 90 90 104 105 ...
 $ lattice_angle_gamma_degree: num  90 90 120 90 90 ...
 - attr(*, "spec")=List of 2
  ..$ cols   :List of 12
  .. ..$ id                        : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ spacegroup                : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ number_of_total_atoms     : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ percent_atom_al           : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ percent_atom_ga           : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ percent_atom_in           : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ lattice_vector_1_ang      : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ lattice_vector_2_ang      : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ lattice_vector_3_ang      : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ lattice_angle_alpha_degree: list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ lattice_angle_beta_degree : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ lattice_angle_gamma_degree: list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  ..$ default: list()
  .. ..- attr(*, "class")= chr  "collector_guess" "collector"
  ..- attr(*, "class")= chr "col_spec"
# Per-column summary statistics of the training data
summary(object = train)
       id           spacegroup    number_of_total_atoms percent_atom_al  percent_atom_ga 
 Min.   :   1.0   Min.   : 12.0   Min.   :10.00         Min.   :0.0000   Min.   :0.0000  
 1st Qu.: 600.8   1st Qu.: 33.0   1st Qu.:40.00         1st Qu.:0.1667   1st Qu.:0.0938  
 Median :1200.5   Median :194.0   Median :80.00         Median :0.3750   Median :0.2812  
 Mean   :1200.5   Mean   :141.5   Mean   :61.68         Mean   :0.3854   Mean   :0.3086  
 3rd Qu.:1800.2   3rd Qu.:206.0   3rd Qu.:80.00         3rd Qu.:0.5833   3rd Qu.:0.4688  
 Max.   :2400.0   Max.   :227.0   Max.   :80.00         Max.   :1.0000   Max.   :1.0000  
 percent_atom_in  lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
 Min.   :0.0000   Min.   : 3.037       Min.   : 2.942       Min.   : 5.673      
 1st Qu.:0.0625   1st Qu.: 6.141       1st Qu.: 5.834       1st Qu.: 9.298      
 Median :0.2500   Median : 9.537       Median : 6.383       Median :10.125      
 Mean   :0.3060   Mean   :10.030       Mean   : 7.087       Mean   :12.593      
 3rd Qu.:0.4688   3rd Qu.:10.292       3rd Qu.: 9.093       3rd Qu.:14.372      
 Max.   :1.0000   Max.   :24.913       Max.   :10.290       Max.   :25.346      
 lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
 Min.   : 82.74             Min.   : 81.64            Min.   : 29.73            
 1st Qu.: 90.00             1st Qu.: 90.00            1st Qu.: 90.00            
 Median : 90.00             Median : 90.00            Median : 90.00            
 Mean   : 90.24             Mean   : 92.40            Mean   : 94.79            
 3rd Qu.: 90.01             3rd Qu.: 90.01            3rd Qu.:120.00            
 Max.   :101.23             Max.   :106.17            Max.   :120.05            
 formation_energy_ev_natom bandgap_energy_ev
 Min.   :0.0000            Min.   :0.0001   
 1st Qu.:0.1056            1st Qu.:1.2785   
 Median :0.1818            Median :1.9079   
 Mean   :0.1876            Mean   :2.0772   
 3rd Qu.:0.2563            3rd Qu.:2.7620   
 Max.   :0.6572            Max.   :5.2861   
# Per-column summary statistics of the test data
summary(object = test)
       id          spacegroup    number_of_total_atoms percent_atom_al  percent_atom_ga 
 Min.   :  1.0   Min.   : 12.0   Min.   :10.00         Min.   :0.0000   Min.   :0.0000  
 1st Qu.:150.8   1st Qu.: 33.0   1st Qu.:40.00         1st Qu.:0.1250   1st Qu.:0.0938  
 Median :300.5   Median :194.0   Median :80.00         Median :0.3750   Median :0.2500  
 Mean   :300.5   Mean   :139.6   Mean   :61.73         Mean   :0.3710   Mean   :0.3133  
 3rd Qu.:450.2   3rd Qu.:206.0   3rd Qu.:80.00         3rd Qu.:0.5625   3rd Qu.:0.4688  
 Max.   :600.0   Max.   :227.0   Max.   :80.00         Max.   :1.0000   Max.   :0.9688  
 percent_atom_in  lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
 Min.   :0.0000   Min.   : 3.073       Min.   : 2.960       Min.   : 5.698      
 1st Qu.:0.0625   1st Qu.: 6.137       1st Qu.: 5.829       1st Qu.: 9.309      
 Median :0.2812   Median : 9.495       Median : 6.398       Median :10.097      
 Mean   :0.3157   Mean   :10.098       Mean   : 7.082       Mean   :12.442      
 3rd Qu.:0.4688   3rd Qu.:10.363       3rd Qu.: 9.157       3rd Qu.:14.328      
 Max.   :0.9688   Max.   :24.913       Max.   :10.249       Max.   :25.306      
 lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
 Min.   : 83.74             Min.   : 82.75            Min.   : 29.72            
 1st Qu.: 90.00             1st Qu.: 90.00            1st Qu.: 90.00            
 Median : 90.00             Median : 90.00            Median : 90.00            
 Mean   : 90.16             Mean   : 92.49            Mean   : 96.33            
 3rd Qu.: 90.01             3rd Qu.: 90.01            3rd Qu.:120.00            
 Max.   :100.95             Max.   :105.98            Max.   :120.05            
# Check the training data for missing values ----
library(Amelia)
# `train` is a tibble (several classes, strict `$` access). Passing it straight
# to missmap() triggers the "condition has length > 1" and "Unknown or
# uninitialised column" warnings, so hand over a plain data.frame instead.
missmap(as.data.frame(train))
the condition has length > 1 and only the first element will be usedUnknown or uninitialised column: 'arguments'.Unknown or uninitialised column: 'arguments'.Unknown or uninitialised column: 'imputations'.

From the plot, we can observe that there is no missing information in the training dataset.

# Check the test data for missing values ----
library(Amelia)
# `test` is a tibble (several classes, strict `$` access). Passing it straight
# to missmap() triggers the "condition has length > 1" and "Unknown or
# uninitialised column" warnings, so hand over a plain data.frame instead.
missmap(as.data.frame(test))
the condition has length > 1 and only the first element will be usedUnknown or uninitialised column: 'arguments'.Unknown or uninitialised column: 'arguments'.Unknown or uninitialised column: 'imputations'.

No missing data.

# Histograms of the numeric predictors and of the formation energy ----
library(gridExtra)
library(ggplot2)

# Layers shared by every histogram: red-outlined bars filled by bin count on a
# green-to-blue gradient, plus a title naming the variable.
#   variable: column name, used only for the plot title.
#   ...:      forwarded to geom_histogram() (e.g. `breaks`).
# Returns a list of ggplot2 components that can be added to a plot with `+`.
histogram_layers <- function(variable, ...) {
  list(
    geom_histogram(col = "red", aes(fill = ..count..), ...),
    scale_fill_gradient("Count", low = "green", high = "blue"),
    labs(title = paste("Histogram for", variable))
  )
}

# One-degree bins. Alpha and beta lie within roughly 81-107 degrees, but gamma
# spans roughly 30-120 degrees, so it needs wider limits; with the alpha/beta
# breaks most gamma observations would fall outside the bins and be dropped.
angle_breaks <- seq(80, 110, by = 1)
gamma_breaks <- seq(25, 125, by = 1)

p1 <- ggplot(train, aes(x = percent_atom_al)) +
  histogram_layers("percent_atom_al")
p2 <- ggplot(train, aes(x = percent_atom_ga)) +
  histogram_layers("percent_atom_ga")
p3 <- ggplot(train, aes(x = percent_atom_in)) +
  histogram_layers("percent_atom_in")
p4 <- ggplot(train, aes(x = lattice_vector_1_ang)) +
  histogram_layers("lattice_vector_1_ang")
p5 <- ggplot(train, aes(x = lattice_vector_2_ang)) +
  histogram_layers("lattice_vector_2_ang")
p6 <- ggplot(train, aes(x = lattice_vector_3_ang)) +
  histogram_layers("lattice_vector_3_ang")
p7 <- ggplot(train, aes(x = lattice_angle_alpha_degree)) +
  histogram_layers("lattice_angle_alpha_degree", breaks = angle_breaks)
p8 <- ggplot(train, aes(x = lattice_angle_beta_degree)) +
  histogram_layers("lattice_angle_beta_degree", breaks = angle_breaks)
p9 <- ggplot(train, aes(x = lattice_angle_gamma_degree)) +
  histogram_layers("lattice_angle_gamma_degree", breaks = gamma_breaks)
p10 <- ggplot(train, aes(x = formation_energy_ev_natom)) +
  histogram_layers("formation_energy_ev_natom")

grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)

grid.arrange(p5, p6, ncol = 2, nrow = 1)

grid.arrange(p7, p8, ncol = 2, nrow = 1)

grid.arrange(p9, p10, ncol = 2, nrow = 1)

# Treat space group and atom count as categorical variables ----
train$spacegroup <- as.factor(train$spacegroup)
train$number_of_total_atoms <- as.factor(train$number_of_total_atoms)

# Pairwise scatterplots of the three composition fractions, coloured by space
# group with a smoother per group, one panel per atom count
s1 <- ggplot(
  train,
  aes(x = percent_atom_al, y = percent_atom_ga, colour = spacegroup)
) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~number_of_total_atoms)
s2 <- ggplot(
  train,
  aes(x = percent_atom_al, y = percent_atom_in, colour = spacegroup)
) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~number_of_total_atoms)
s3 <- ggplot(
  train,
  aes(x = percent_atom_ga, y = percent_atom_in, colour = spacegroup)
) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~number_of_total_atoms)
s1

s2

s3

# Band gap energy against each predictor ----
# Every plot is coloured by space group and split into one panel per atom count.

# Band gap vs. gallium fraction
s4 <- ggplot(
  train,
  aes(x = percent_atom_ga, y = bandgap_energy_ev, colour = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s4

# Band gap vs. aluminium fraction
s5 <- ggplot(
  train,
  aes(x = percent_atom_al, y = bandgap_energy_ev, colour = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s5

# Band gap vs. indium fraction
s6 <- ggplot(
  train,
  aes(x = percent_atom_in, y = bandgap_energy_ev, colour = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s6

# Band gap vs. first lattice vector
s7 <- ggplot(
  train,
  aes(x = lattice_vector_1_ang, y = bandgap_energy_ev, colour = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s7

# Band gap vs. second lattice vector
s8 <- ggplot(
  train,
  aes(x = lattice_vector_2_ang, y = bandgap_energy_ev, colour = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s8

# Band gap vs. third lattice vector
s9 <- ggplot(
  train,
  aes(x = lattice_vector_3_ang, y = bandgap_energy_ev, colour = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s9

# Band gap vs. lattice angle alpha
s10 <- ggplot(
  train,
  aes(x = lattice_angle_alpha_degree, y = bandgap_energy_ev, colour = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s10

# Band gap vs. lattice angle beta
s11 <- ggplot(
  train,
  aes(x = lattice_angle_beta_degree, y = bandgap_energy_ev, colour = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s11

# Band gap vs. lattice angle gamma
s12 <- ggplot(
  train,
  aes(x = lattice_angle_gamma_degree, y = bandgap_energy_ev, colour = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s12

# Band gap vs. formation energy
s13 <- ggplot(
  train,
  aes(x = formation_energy_ev_natom, y = bandgap_energy_ev, colour = spacegroup)
) +
  geom_point() +
  facet_wrap(~number_of_total_atoms)
s13

# Rows of the training data that belong to space group 33, with their summary
spacegroup33 <- train[train$spacegroup %in% "33", ]
summary(object = spacegroup33)
       id         spacegroup number_of_total_atoms percent_atom_al  percent_atom_ga 
 Min.   :   1.0   12 :  0    10:  0                Min.   :0.0000   Min.   :0.0000  
 1st Qu.: 519.8   33 :432    20:  0                1st Qu.:0.1562   1st Qu.:0.0625  
 Median :1156.5   167:  0    30:  0                Median :0.4062   Median :0.1875  
 Mean   :1153.0   194:  0    40:126                Mean   :0.4063   Mean   :0.2629  
 3rd Qu.:1722.5   206:  0    60:  0                3rd Qu.:0.6250   3rd Qu.:0.4062  
 Max.   :2399.0   227:  0    80:306                Max.   :0.9688   Max.   :1.0000  
 percent_atom_in  lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
 Min.   :0.0000   Min.   : 4.903       Min.   :8.414        Min.   : 9.046      
 1st Qu.:0.0938   1st Qu.: 5.529       1st Qu.:8.651        1st Qu.: 9.277      
 Median :0.2500   Median :10.137       Median :8.881        Median : 9.505      
 Mean   :0.3308   Mean   : 8.889       Mean   :8.916        Mean   : 9.542      
 3rd Qu.:0.5312   3rd Qu.:10.481       3rd Qu.:9.168        3rd Qu.: 9.787      
 Max.   :1.0000   Max.   :11.284       Max.   :9.640        Max.   :10.264      
 lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
 Min.   :90.00              Min.   :90                Min.   :90                
 1st Qu.:90.00              1st Qu.:90                1st Qu.:90                
 Median :90.00              Median :90                Median :90                
 Mean   :90.00              Mean   :90                Mean   :90                
 3rd Qu.:90.00              3rd Qu.:90                3rd Qu.:90                
 Max.   :90.01              Max.   :90                Max.   :90                
 formation_energy_ev_natom bandgap_energy_ev
 Min.   :0.0254            Min.   :0.6522   
 1st Qu.:0.1109            1st Qu.:1.3665   
 Median :0.1779            Median :2.0789   
 Mean   :0.1692            Mean   :2.1968   
 3rd Qu.:0.2308            3rd Qu.:2.9715   
 Max.   :0.3520            Max.   :4.6795   
# Rows of the training data that belong to space group 12, with their summary
spacegroup12 <- train[train$spacegroup %in% "12", ]
summary(object = spacegroup12)
       id         spacegroup number_of_total_atoms percent_atom_al  percent_atom_ga
 Min.   :   8.0   12 :358    10:  0                Min.   :0.0000   Min.   :0.000  
 1st Qu.: 571.2   33 :  0    20: 84                1st Qu.:0.1562   1st Qu.:0.250  
 Median :1089.5   167:  0    30:  0                Median :0.3438   Median :0.375  
 Mean   :1150.7   194:  0    40:  0                Mean   :0.3186   Mean   :0.387  
 3rd Qu.:1804.8   206:  0    60:  0                3rd Qu.:0.4375   3rd Qu.:0.500  
 Max.   :2393.0   227:  0    80:274                Max.   :1.0000   Max.   :1.000  
 percent_atom_in  lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
 Min.   :0.0000   Min.   :11.94        Min.   :2.942        Min.   :5.673       
 1st Qu.:0.1250   1st Qu.:24.09        1st Qu.:5.942        1st Qu.:5.949       
 Median :0.2812   Median :24.44        Median :6.212        Median :6.108       
 Mean   :0.2944   Mean   :21.64        Mean   :5.517        Mean   :6.114       
 3rd Qu.:0.4062   3rd Qu.:24.60        3rd Qu.:6.299        3rd Qu.:6.241       
 Max.   :1.0000   Max.   :24.91        Max.   :6.676        Max.   :6.905       
 lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
 Min.   :90                 Min.   :103.7             Min.   :90                
 1st Qu.:90                 1st Qu.:104.2             1st Qu.:90                
 Median :90                 Median :104.6             Median :90                
 Mean   :90                 Mean   :104.6             Mean   :90                
 3rd Qu.:90                 3rd Qu.:104.9             3rd Qu.:90                
 Max.   :90                 Max.   :106.2             Max.   :90                
 formation_energy_ev_natom bandgap_energy_ev
 Min.   :0.00000           Min.   :0.722    
 1st Qu.:0.04295           1st Qu.:1.481    
 Median :0.12565           Median :1.857    
 Mean   :0.12642           Mean   :1.939    
 3rd Qu.:0.17985           3rd Qu.:2.251    
 Max.   :0.40460           Max.   :4.459    
# Rows of the training data that belong to space group 167, with their summary
spacegroup167 <- train[train$spacegroup %in% "167", ]
summary(object = spacegroup167)
       id         spacegroup number_of_total_atoms percent_atom_al  percent_atom_ga 
 Min.   :   4.0   12 :  0    10:  0                Min.   :0.0000   Min.   :0.0000  
 1st Qu.: 701.5   33 :  0    20:  0                1st Qu.:0.2500   1st Qu.:0.0833  
 Median :1271.5   167:374    30:326                Median :0.4167   Median :0.3333  
 Mean   :1262.1   194:  0    40:  0                Mean   :0.4334   Mean   :0.3234  
 3rd Qu.:1859.2   206:  0    60: 48                3rd Qu.:0.6667   3rd Qu.:0.5000  
 Max.   :2397.0   227:  0    80:  0                Max.   :0.9167   Max.   :1.0000  
 percent_atom_in  lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
 Min.   :0.0000   Min.   : 4.833       Min.   :4.833        Min.   :13.17       
 1st Qu.:0.0000   1st Qu.: 4.957       1st Qu.:4.936        1st Qu.:13.38       
 Median :0.1667   Median : 5.067       Median :5.024        Median :13.57       
 Mean   :0.2432   Mean   : 5.737       Mean   :5.078        Mean   :13.68       
 3rd Qu.:0.4167   3rd Qu.: 5.278       3rd Qu.:5.193        3rd Qu.:13.93       
 Max.   :1.0000   Max.   :11.070       Max.   :5.578        Max.   :14.75       
 lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
 Min.   :89.98              Min.   :90.00             Min.   :120               
 1st Qu.:89.99              1st Qu.:90.00             1st Qu.:120               
 Median :89.99              Median :90.01             Median :120               
 Mean   :89.99              Mean   :90.01             Mean   :120               
 3rd Qu.:90.00              3rd Qu.:90.01             3rd Qu.:120               
 Max.   :90.00              Max.   :90.02             Max.   :120               
 formation_energy_ev_natom bandgap_energy_ev
 Min.   :0.01500           Min.   :0.9439   
 1st Qu.:0.06872           1st Qu.:1.9798   
 Median :0.13315           Median :2.9968   
 Mean   :0.13614           Mean   :2.8962   
 3rd Qu.:0.19027           3rd Qu.:3.7161   
 Max.   :0.35070           Max.   :5.2861   
# Rows of the training data that belong to space group 194, with their summary
spacegroup194 <- train[train$spacegroup %in% "194", ]
summary(object = spacegroup194)
       id       spacegroup number_of_total_atoms percent_atom_al  percent_atom_ga 
 Min.   :   2   12 :  0    10: 13                Min.   :0.0000   Min.   :0.0000  
 1st Qu.: 647   33 :  0    20:  0                1st Qu.:0.2188   1st Qu.:0.1250  
 Median :1302   167:  0    30:  0                Median :0.3750   Median :0.3125  
 Mean   :1252   194:353    40:  0                Mean   :0.3905   Mean   :0.3249  
 3rd Qu.:1882   206:  0    60:  0                3rd Qu.:0.5625   3rd Qu.:0.4688  
 Max.   :2395   227:  0    80:340                Max.   :1.0000   Max.   :1.0000  
 percent_atom_in  lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
 Min.   :0.0000   Min.   :3.037        Min.   :3.037        Min.   :11.67       
 1st Qu.:0.0625   1st Qu.:6.242        1st Qu.:6.242        1st Qu.:23.75       
 Median :0.2812   Median :6.462        Median :6.462        Median :24.17       
 Mean   :0.2846   Mean   :6.349        Mean   :6.349        Mean   :23.72       
 3rd Qu.:0.4375   3rd Qu.:6.632        3rd Qu.:6.631        3rd Qu.:24.48       
 Max.   :1.0000   Max.   :7.099        Max.   :7.098        Max.   :25.35       
 lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
 Min.   :89.99              Min.   :89.98             Min.   :120               
 1st Qu.:90.01              1st Qu.:90.00             1st Qu.:120               
 Median :90.01              Median :90.00             Median :120               
 Mean   :90.01              Mean   :90.00             Mean   :120               
 3rd Qu.:90.02              3rd Qu.:90.01             3rd Qu.:120               
 Max.   :90.02              Max.   :90.01             Max.   :120               
 formation_energy_ev_natom bandgap_energy_ev
 Min.   :0.0156            Min.   :0.3717   
 1st Qu.:0.2214            1st Qu.:1.0964   
 Median :0.2753            Median :1.5085   
 Mean   :0.2690            Mean   :1.7607   
 3rd Qu.:0.3351            3rd Qu.:2.3692   
 Max.   :0.6572            Max.   :4.1785   
# Rows of the training data that belong to space group 206, with their summary
spacegroup206 <- train[train$spacegroup %in% "206", ]
summary(object = spacegroup206)
       id         spacegroup number_of_total_atoms percent_atom_al  percent_atom_ga 
 Min.   :   7.0   12 :  0    10:  0                Min.   :0.0000   Min.   :0.0000  
 1st Qu.: 580.8   33 :  0    20:  0                1st Qu.:0.0625   1st Qu.:0.0625  
 Median :1136.5   167:  0    30:  0                Median :0.3750   Median :0.2812  
 Mean   :1188.7   194:  0    40:  0                Mean   :0.3314   Mean   :0.2885  
 3rd Qu.:1798.8   206:490    60:  0                3rd Qu.:0.5000   3rd Qu.:0.4375  
 Max.   :2400.0   227:  0    80:490                Max.   :1.0000   Max.   :1.0000  
 percent_atom_in  lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
 Min.   :0.0000   Min.   : 8.985       Min.   : 8.984       Min.   : 8.984      
 1st Qu.:0.0938   1st Qu.: 9.297       1st Qu.: 9.297       1st Qu.: 9.297      
 Median :0.3125   Median : 9.519       Median : 9.519       Median : 9.519      
 Mean   :0.3801   Mean   : 9.602       Mean   : 9.602       Mean   : 9.602      
 3rd Qu.:0.6562   3rd Qu.: 9.922       3rd Qu.: 9.922       3rd Qu.: 9.922      
 Max.   :1.0000   Max.   :10.290       Max.   :10.290       Max.   :10.291      
 lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
 Min.   :90.00              Min.   :90.00             Min.   :89.99             
 1st Qu.:90.00              1st Qu.:90.00             1st Qu.:90.00             
 Median :90.00              Median :90.00             Median :90.00             
 Mean   :90.00              Mean   :90.00             Mean   :90.00             
 3rd Qu.:90.00              3rd Qu.:90.00             3rd Qu.:90.00             
 Max.   :90.01              Max.   :90.01             Max.   :90.00             
 formation_energy_ev_natom bandgap_energy_ev
 Min.   :0.00000           Min.   :0.7883   
 1st Qu.:0.09788           1st Qu.:1.2149   
 Median :0.13180           Median :2.0688   
 Mean   :0.14712           Mean   :2.1575   
 3rd Qu.:0.20450           3rd Qu.:2.9053   
 Max.   :0.50690           Max.   :5.2114   
# Rows of the training data that belong to space group 227, with their summary
spacegroup227 <- train[train$spacegroup %in% "227", ]
summary(object = spacegroup227)
       id       spacegroup number_of_total_atoms percent_atom_al  percent_atom_ga 
 Min.   :   3   12 :  0    10:  0                Min.   :0.0000   Min.   :0.0000  
 1st Qu.: 608   33 :  0    20:  0                1st Qu.:0.1875   1st Qu.:0.1250  
 Median :1255   167:  0    30:  0                Median :0.4375   Median :0.2500  
 Mean   :1208   194:  0    40:393                Mean   :0.4407   Mean   :0.2837  
 3rd Qu.:1720   206:  0    60:  0                3rd Qu.:0.6250   3rd Qu.:0.4375  
 Max.   :2392   227:393    80:  0                Max.   :0.9375   Max.   :0.9375  
 percent_atom_in  lattice_vector_1_ang lattice_vector_2_ang lattice_vector_3_ang
 Min.   :0.0000   Min.   : 5.686       Min.   :5.625        Min.   :13.62       
 1st Qu.:0.1250   1st Qu.: 6.089       1st Qu.:5.790        1st Qu.:14.20       
 Median :0.3125   Median : 9.869       Median :5.920        Median :14.46       
 Mean   :0.2756   Mean   : 8.630       Mean   :5.945        Mean   :14.55       
 3rd Qu.:0.3125   3rd Qu.:10.219       3rd Qu.:6.041        3rd Qu.:14.77       
 Max.   :0.9375   Max.   :11.140       Max.   :6.545        Max.   :16.03       
 lattice_angle_alpha_degree lattice_angle_beta_degree lattice_angle_gamma_degree
 Min.   : 82.74             Min.   : 81.64            Min.   : 29.73            
 1st Qu.: 90.01             1st Qu.: 90.00            1st Qu.: 30.29            
 Median : 90.77             Median : 90.90            Median : 30.60            
 Mean   : 91.48             Mean   : 91.36            Mean   : 63.75            
 3rd Qu.: 91.42             3rd Qu.: 91.62            3rd Qu.:120.00            
 Max.   :101.23             Max.   :101.04            Max.   :120.05            
 formation_energy_ev_natom bandgap_energy_ev
 Min.   :0.0434            Min.   :0.0001   
 1st Qu.:0.2199            1st Qu.:0.8609   
 Median :0.3059            Median :1.4436   
 Mean   :0.2900            Mean   :1.4767   
 3rd Qu.:0.3638            3rd Qu.:2.0357   
 Max.   :0.5369            Max.   :3.6851   
# Box plots of the band gap energy for each space group
ggplot(
  train,
  aes(x = factor(spacegroup), y = bandgap_energy_ev, fill = spacegroup)
) +
  geom_boxplot()

# Box plots of the band gap energy for each total atom count
ggplot(
  train,
  aes(
    x = factor(number_of_total_atoms),
    y = bandgap_energy_ev,
    fill = number_of_total_atoms
  )
) +
  geom_boxplot()

# Observation counts per space group, most frequent first.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
head(sort(table(train$spacegroup), decreasing = TRUE))

206  33 227 167  12 194 
490 432 393 374 358 353 
# Observation counts per total atom count, most frequent first.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
head(sort(table(train$number_of_total_atoms), decreasing = TRUE))

  80   40   30   20   60   10 
1410  519  326   84   48   13 
# Correlation matrix of all columns of the training data ----
library(corrplot)
# spacegroup and number_of_total_atoms were converted to factors earlier in the
# script. as.numeric() on a factor returns the internal level codes (1, 2, ...),
# so go through as.character() to recover the original values (12, 33, ...).
train$spacegroup <- as.numeric(as.character(train$spacegroup))
train$number_of_total_atoms <- as.numeric(
  as.character(train$number_of_total_atoms)
)
t <- cor(train)
corrplot(t, order = "AOE")

# Forward stepwise selection (by AIC) of predictors for bandgap_energy_ev.
# Upper scope: all columns except id and formation_energy_ev_natom, with
# spacegroup and number_of_total_atoms entered as factors instead of numbers.
full <- lm(
  bandgap_energy_ev ~ . -
    (spacegroup + number_of_total_atoms + id + formation_energy_ev_natom) +
    factor(spacegroup) + factor(number_of_total_atoms),
  data = train
)
# Lower scope and starting point: the intercept-only model.
null <- lm(bandgap_energy_ev ~ 1, data = train)
step(
  null,
  scope = list(lower = null, upper = full),
  direction = "forward"
)
Start:  AIC=33.75
bandgap_energy_ev ~ 1

                                Df Sum of Sq    RSS      AIC
+ percent_atom_in                1   1392.41 1039.5 -2004.04
+ percent_atom_al                1   1326.79 1105.2 -1857.13
+ factor(spacegroup)             5    444.14 1987.8  -440.24
+ factor(number_of_total_atoms)  5    369.82 2062.1  -352.14
+ lattice_vector_3_ang           1     59.84 2372.1   -24.05
+ lattice_vector_1_ang           1     46.92 2385.0   -11.01
+ lattice_angle_gamma_degree     1     43.20 2388.8    -7.27
+ lattice_vector_2_ang           1     39.51 2392.4    -3.57
+ lattice_angle_beta_degree      1     14.44 2417.5    21.46
+ lattice_angle_alpha_degree     1     12.72 2419.2    23.16
<none>                                       2432.0    33.75
+ percent_atom_ga                1      0.40 2431.6    35.35

Step:  AIC=-2004.04
bandgap_energy_ev ~ percent_atom_in

                                Df Sum of Sq     RSS     AIC
+ factor(spacegroup)             5    458.27  581.28 -3389.2
+ percent_atom_ga                1    298.04  741.51 -2812.9
+ percent_atom_al                1    298.03  741.51 -2812.9
+ factor(number_of_total_atoms)  5    267.88  771.66 -2709.2
+ lattice_vector_3_ang           1     66.29  973.26 -2160.2
+ lattice_angle_gamma_degree     1     48.39  991.16 -2116.4
+ lattice_vector_2_ang           1     23.42 1016.13 -2056.7
+ lattice_vector_1_ang           1     19.08 1020.46 -2046.5
+ lattice_angle_beta_degree      1     18.66 1020.88 -2045.5
+ lattice_angle_alpha_degree     1     18.58 1020.97 -2045.3
<none>                                       1039.55 -2004.0

Step:  AIC=-3389.18
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup)

                                Df Sum of Sq    RSS     AIC
+ percent_atom_ga                1   308.102 273.18 -5199.4
+ percent_atom_al                1   308.097 273.18 -5199.4
+ lattice_vector_2_ang           1    13.506 567.77 -3443.6
+ lattice_vector_3_ang           1     7.650 573.63 -3419.0
+ lattice_angle_beta_degree      1     4.677 576.60 -3406.6
+ lattice_angle_alpha_degree     1     4.143 577.14 -3404.4
+ lattice_angle_gamma_degree     1     2.636 578.64 -3398.1
+ lattice_vector_1_ang           1     1.967 579.31 -3395.3
+ factor(number_of_total_atoms)  4     2.645 578.63 -3392.1
<none>                                       581.28 -3389.2

Step:  AIC=-5199.45
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga

                                Df Sum of Sq    RSS     AIC
+ lattice_angle_alpha_degree     1    6.1430 267.04 -5252.0
+ lattice_angle_beta_degree      1    4.5009 268.68 -5237.3
+ lattice_angle_gamma_degree     1    3.6643 269.51 -5229.9
+ factor(number_of_total_atoms)  4    2.8920 270.29 -5217.0
+ lattice_vector_2_ang           1    1.3857 271.79 -5209.6
+ lattice_vector_3_ang           1    0.7584 272.42 -5204.1
+ percent_atom_al                1    0.5535 272.62 -5202.3
+ lattice_vector_1_ang           1    0.2884 272.89 -5200.0
<none>                                       273.18 -5199.4

Step:  AIC=-5252.03
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga + 
    lattice_angle_alpha_degree

                                Df Sum of Sq    RSS     AIC
+ factor(number_of_total_atoms)  4   2.89696 264.14 -5270.2
+ lattice_vector_2_ang           1   1.24573 265.79 -5261.3
+ lattice_angle_gamma_degree     1   1.14354 265.89 -5260.3
+ lattice_angle_beta_degree      1   0.96039 266.07 -5258.7
+ lattice_vector_3_ang           1   0.88835 266.15 -5258.0
+ lattice_vector_1_ang           1   0.69947 266.33 -5256.3
+ percent_atom_al                1   0.54848 266.49 -5255.0
<none>                                       267.04 -5252.0

Step:  AIC=-5270.21
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga + 
    lattice_angle_alpha_degree + factor(number_of_total_atoms)

                             Df Sum of Sq    RSS     AIC
+ lattice_vector_1_ang        1   1.30505 262.83 -5280.1
+ lattice_angle_gamma_degree  1   1.14143 263.00 -5278.6
+ lattice_vector_3_ang        1   1.12189 263.02 -5278.4
+ lattice_angle_beta_degree   1   0.83689 263.30 -5275.8
+ percent_atom_al             1   0.59421 263.54 -5273.6
<none>                                    264.14 -5270.2
+ lattice_vector_2_ang        1   0.01256 264.12 -5268.3

Step:  AIC=-5280.1
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga + 
    lattice_angle_alpha_degree + factor(number_of_total_atoms) + 
    lattice_vector_1_ang

                             Df Sum of Sq    RSS     AIC
+ lattice_vector_3_ang        1   1.51080 261.32 -5291.9
+ lattice_angle_beta_degree   1   1.10208 261.73 -5288.2
+ percent_atom_al             1   0.57962 262.25 -5283.4
<none>                                    262.83 -5280.1
+ lattice_angle_gamma_degree  1   0.21830 262.61 -5280.1
+ lattice_vector_2_ang        1   0.00056 262.83 -5278.1

Step:  AIC=-5291.93
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga + 
    lattice_angle_alpha_degree + factor(number_of_total_atoms) + 
    lattice_vector_1_ang + lattice_vector_3_ang

                             Df Sum of Sq    RSS     AIC
+ percent_atom_al             1   0.61941 260.70 -5295.6
+ lattice_angle_beta_degree   1   0.55210 260.77 -5295.0
+ lattice_angle_gamma_degree  1   0.26185 261.06 -5292.3
<none>                                    261.32 -5291.9
+ lattice_vector_2_ang        1   0.08657 261.24 -5290.7

Step:  AIC=-5295.63
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga + 
    lattice_angle_alpha_degree + factor(number_of_total_atoms) + 
    lattice_vector_1_ang + lattice_vector_3_ang + percent_atom_al

                             Df Sum of Sq    RSS     AIC
+ lattice_angle_beta_degree   1   0.52303 260.18 -5298.4
+ lattice_angle_gamma_degree  1   0.22581 260.48 -5295.7
<none>                                    260.70 -5295.6
+ lattice_vector_2_ang        1   0.07040 260.63 -5294.3

Step:  AIC=-5298.45
bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga + 
    lattice_angle_alpha_degree + factor(number_of_total_atoms) + 
    lattice_vector_1_ang + lattice_vector_3_ang + percent_atom_al + 
    lattice_angle_beta_degree

                             Df Sum of Sq    RSS     AIC
<none>                                    260.18 -5298.4
+ lattice_angle_gamma_degree  1  0.017998 260.16 -5296.6
+ lattice_vector_2_ang        1  0.000141 260.18 -5296.4

Call:
lm(formula = bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + 
    percent_atom_ga + lattice_angle_alpha_degree + factor(number_of_total_atoms) + 
    lattice_vector_1_ang + lattice_vector_3_ang + percent_atom_al + 
    lattice_angle_beta_degree, data = train)

Coefficients:
                   (Intercept)                 percent_atom_in  
                     384.16390                      -385.76935  
           factor(spacegroup)2             factor(spacegroup)3  
                      -0.67300                        -4.04435  
           factor(spacegroup)4             factor(spacegroup)5  
                      -4.09774                        -0.41638  
           factor(spacegroup)6                 percent_atom_ga  
                      -2.89027                      -383.65799  
    lattice_angle_alpha_degree  factor(number_of_total_atoms)2  
                       0.07393                        -2.45398  
factor(number_of_total_atoms)3  factor(number_of_total_atoms)4  
                       0.22031                        -2.66489  
factor(number_of_total_atoms)5  factor(number_of_total_atoms)6  
                            NA                        -2.97470  
          lattice_vector_1_ang            lattice_vector_3_ang  
                       0.03630                         0.21098  
               percent_atom_al       lattice_angle_beta_degree  
                    -381.79333                        -0.04223  

The model with the lowest AIC (-5298.45) is lm(formula = bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + percent_atom_ga + lattice_angle_alpha_degree + factor(number_of_total_atoms) + lattice_vector_1_ang + lattice_vector_3_ang + percent_atom_al + lattice_angle_beta_degree, data = train)

# Linear regression: refit the model selected by the forward AIC search
# with the two categorical columns stored as factors, then summarise it.
train$spacegroup <- as.factor(train$spacegroup)
train$number_of_total_atoms <- as.factor(train$number_of_total_atoms)
m1 <- lm(
  formula = bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) +
    percent_atom_ga + lattice_angle_alpha_degree +
    factor(number_of_total_atoms) + lattice_vector_1_ang +
    lattice_vector_3_ang + percent_atom_al + lattice_angle_beta_degree,
  data = train
)
summary(m1)

Call:
lm(formula = bandgap_energy_ev ~ percent_atom_in + factor(spacegroup) + 
    percent_atom_ga + lattice_angle_alpha_degree + factor(number_of_total_atoms) + 
    lattice_vector_1_ang + lattice_vector_3_ang + percent_atom_al + 
    lattice_angle_beta_degree, data = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.39702 -0.20506 -0.02209  0.20947  1.34884 

Coefficients: (1 not defined because of singularities)
                                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)                     384.16390  164.20613   2.340 0.019391 *  
percent_atom_in                -385.76935  164.19467  -2.349 0.018882 *  
factor(spacegroup)2              -0.67300    0.32377  -2.079 0.037757 *  
factor(spacegroup)3              -4.04435    1.32498  -3.052 0.002295 ** 
factor(spacegroup)4              -4.09774    1.20482  -3.401 0.000682 ***
factor(spacegroup)5              -0.41637    0.32435  -1.284 0.199366    
factor(spacegroup)6              -2.89026    0.57916  -4.990 6.46e-07 ***
percent_atom_ga                -383.65799  164.19104  -2.337 0.019540 *  
lattice_angle_alpha_degree        0.07393    0.02189   3.377 0.000744 ***
factor(number_of_total_atoms)2   -2.45398    0.84491  -2.904 0.003713 ** 
factor(number_of_total_atoms)3    0.22031    0.06858   3.213 0.001333 ** 
factor(number_of_total_atoms)4   -2.66489    0.84552  -3.152 0.001643 ** 
factor(number_of_total_atoms)5         NA         NA      NA       NA    
factor(number_of_total_atoms)6   -2.97470    0.85188  -3.492 0.000488 ***
lattice_vector_1_ang              0.03630    0.00889   4.083 4.59e-05 ***
lattice_vector_3_ang              0.21098    0.06967   3.028 0.002488 ** 
percent_atom_al                -381.79332  164.19206  -2.325 0.020140 *  
lattice_angle_beta_degree        -0.04224    0.01930  -2.189 0.028715 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3304 on 2383 degrees of freedom
Multiple R-squared:  0.893, Adjusted R-squared:  0.8923 
F-statistic:  1243 on 16 and 2383 DF,  p-value: < 2.2e-16
# Checking model diagnostics: residuals of m1 plotted against observation index
r <- residuals(object = m1)
plot(x = r, col = "green")

The residuals are scattered randomly around zero with no visible pattern. The errors appear to be approximately normally distributed.

# Normality test: Shapiro-Wilk test on the residuals of m1
shapiro.test(x = r)

    Shapiro-Wilk normality test

data:  r
W = 0.98739, p-value = 9.859e-14

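W = 0.987 is close to 1, so the departure from normality is small. However, the p-value (9.859e-14) is far below 0.05, so the Shapiro-Wilk test rejects the null hypothesis that the residuals are normally distributed; the normality assumption holds only approximately.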

# Remove the id column. Columns are selected by name; note that
# `test[-1, ]` would drop the first ROW (one observation), not the id column.
test <- test[, setdiff(names(test), "id")]
# Store the categorical predictors as factors, as was done for train
test$spacegroup <- as.factor(test$spacegroup)
test$number_of_total_atoms <- as.factor(test$number_of_total_atoms)
# Predict bandgap_energy_ev for every test row with the fitted model m1
p <- predict(m1, newdata = test)
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) : 
  factor factor(spacegroup) has new levels 12, 33, 167, 194, 206, 227
