Simple linear regression assignment: predicting computer price from hard-disk size (hd)

# Load the computer price data set.
# NOTE: absolute, machine-specific path; adjust it when running elsewhere.
computer_data <- read.csv("C:\\Users\\amits\\Desktop\\sconcept\\datascience training\\assignment\\datasetsandcodesandassignments\\Computer_Data.csv")
head(computer_data)
##   X price speed  hd ram screen cd multi premium ads trend
## 1 1  1499    25  80   4     14 no    no     yes  94     1
## 2 2  1795    33  85   2     14 no    no     yes  94     1
## 3 3  1595    25 170   4     15 no    no     yes  94     1
## 4 4  1849    25 170   8     14 no    no      no  94     1
## 5 5  3295    33 340  16     14 no    no     yes  94     1
## 6 6  3695    66 340  16     14 no    no     yes  94     1
# Preview the predictor (hd) and the response (price). Columns are selected
# by name rather than by position (previously c(4, 2)) so that a CSV with a
# different column order cannot silently pick the wrong variables.
head(computer_data[, c("hd", "price")])
##    hd price
## 1  80  1499
## 2  85  1795
## 3 170  1595
## 4 170  1849
## 5 340  3295
## 6 340  3695
# Working data frame for the regression: hd first, price second.
pricehd <- computer_data[, c("hd", "price")]

Keep only the hd and price columns.

# Confirm the two-column subset looks as expected.
head(pricehd)
##    hd price
## 1  80  1499
## 2  85  1795
## 3 170  1595
## 4 170  1849
## 5 340  3295
## 6 340  3695
# attach() places the columns of pricehd on the search path, making `hd` and
# `price` available by bare name.
attach(pricehd)
# Open the data frame in the interactive data viewer.
View(pricehd)
# Five-number summary plus mean for both columns.
summary(pricehd)
##        hd             price     
##  Min.   :  80.0   Min.   : 949  
##  1st Qu.: 214.0   1st Qu.:1794  
##  Median : 340.0   Median :2144  
##  Mean   : 416.6   Mean   :2220  
##  3rd Qu.: 528.0   3rd Qu.:2595  
##  Max.   :2100.0   Max.   :5399

Check whether price is approximately normally distributed (normal Q-Q plot)

# Normal Q-Q plot of price in a new graphics window.
# dev.new() is the platform-independent replacement for windows(), which only
# exists on Windows builds of R; noRStudioGD = TRUE keeps the plot in a
# separate window, as windows() did.
dev.new(noRStudioGD = TRUE)
qqnorm(pricehd$price)
qqline(pricehd$price)

## scatter plot

dev.new(noRStudioGD = TRUE)
# Columns are taken from pricehd explicitly instead of the attached search
# path, so a stray global named `price` or `hd` cannot be picked up.
plot(price ~ hd, data = pricehd)

# Pearson correlation between price and hard-disk size.
cor(pricehd$price, pricehd$hd)
## [1] 0.4302578

r = 0.43 < 0.65, so the linear correlation between price and hd is weak

# Baseline simple linear regression of price on hd.
# `data = pricehd` makes the model self-contained: the variables are looked up
# in the data frame rather than through attach() / the global environment.
m1 <- lm(price ~ hd, data = pricehd)
summary(m1)
## 
## Call:
## lm(formula = price ~ hd, data = pricehd)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1338.45  -382.23   -44.47   315.34  2674.65 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.817e+03  1.257e+01   144.6   <2e-16 ***
## hd          9.665e-01  2.564e-02    37.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 524.3 on 6257 degrees of freedom
## Multiple R-squared:  0.1851, Adjusted R-squared:  0.185 
## F-statistic:  1421 on 1 and 6257 DF,  p-value: < 2.2e-16

R-squared: 0.1851, i.e. the model explains about 18.5% of the variance in price

p-value: < 2.2e-16; the significance code is three stars (p < 0.001), so the hd coefficient is significant at the 0.05 level

Prediction on the training data

# Fitted prices from m1 for every row of the training data.
pv <- predict(m1, newdata = pricehd)
class(pv) # predict() returned a plain numeric vector
## [1] "numeric"
# Wrap the fitted values (y-hat) in a data frame so they can be bound as a
# column.
pv1 <- as.data.frame(pv)
# Observed hd and price side by side with the fitted price.
final <- cbind(pricehd, pv1)
View(final)
# Switch to the assignment folder and save the combined table there.
setwd("C:\\Users\\amits\\Desktop\\sconcept\\datascience training\\assignment")
write.csv(final, file = "Aug16_SLR_assignment.csv")
getwd()
## [1] "C:/Users/amits/Desktop/sconcept/datascience training/assignment"
# Read the saved file back in; `test` is the data the later models predict on.
test <- read.csv(file = "C:\\Users\\amits\\Desktop\\sconcept\\datascience training\\assignment\\Aug16_SLR_assignment.csv")
View(test)

Improving the fit with variable transformations

Logarithmic transformation of the predictor, log(hd)

# Regress price on log(hd).
# `data = pricehd` makes the model self-contained: the variables are looked up
# in the data frame rather than through attach() / the global environment.
m2 <- lm(price ~ log(hd), data = pricehd)
summary(m2)
## 
## Call:
## lm(formula = price ~ log(hd), data = pricehd)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1231.30  -366.59   -22.54   312.96  2644.09 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -301.88      61.92  -4.875 1.11e-06 ***
## log(hd)       431.14      10.53  40.951  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 515.8 on 6257 degrees of freedom
## Multiple R-squared:  0.2114, Adjusted R-squared:  0.2112 
## F-statistic:  1677 on 1 and 6257 DF,  p-value: < 2.2e-16

## R-squared: 0.2114, i.e. about 21.1% of the variance in price is explained

# Show the m2 predictions for the rows of `test` in the data viewer.
View(predict(m2, newdata = test))

Square-root transformation of the predictor, sqrt(hd)

# Regress price on sqrt(hd).
# `data = pricehd` makes the model self-contained: the variables are looked up
# in the data frame rather than through attach() / the global environment.
m3 <- lm(price ~ sqrt(hd), data = pricehd)
summary(m3)
## 
## Call:
## lm(formula = price ~ sqrt(hd), data = pricehd)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1306.15  -371.62   -45.63   309.23  2630.29 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1362.637     22.372   60.91   <2e-16 ***
## sqrt(hd)      43.909      1.096   40.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 518.2 on 6257 degrees of freedom
## Multiple R-squared:  0.2041, Adjusted R-squared:  0.204 
## F-statistic:  1605 on 1 and 6257 DF,  p-value: < 2.2e-16

R-squared: 0.2041, i.e. about 20.4% of the variance in price is explained

# Show the m3 predictions for the rows of `test` in the data viewer.
View(predict(m3, newdata =test))

Exponential model (log transformation of the response, log(price))

# Regress log(price) on hd (exponential relationship on the original scale).
# `data = pricehd` makes the model self-contained: the variables are looked up
# in the data frame rather than through attach() / the global environment.
# The residuals and R-squared below are on the log(price) scale.
m4 <- lm(log(price) ~ hd, data = pricehd)
summary(m4)
## 
## Call:
## lm(formula = log(price) ~ hd, data = pricehd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6935 -0.1643  0.0023  0.1588  0.7987 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.493e+00  5.584e-03  1341.9   <2e-16 ***
## hd          4.293e-04  1.139e-05    37.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2329 on 6257 degrees of freedom
## Multiple R-squared:  0.1851, Adjusted R-squared:  0.1849 
## F-statistic:  1421 on 1 and 6257 DF,  p-value: < 2.2e-16

R-squared: 0.1851, i.e. about 18.5% of the variance in log(price) is explained

# Predictions from the log(price) model for the rows of `test`.
# NOTE: m4 was fitted to log(price), so these values are on the log scale;
# apply exp() to compare them with observed prices.
p4 <- predict(object = m4, newdata = test)

Squared predictor (hd^2)

# Quadratic-term model: regress price on the square of hd.
# Inside a formula `^` is the crossing operator, so `price ~ hd^2` is the same
# model as `price ~ hd` (which is why this fit previously showed "no change"
# against the baseline). I() makes R square hd arithmetically instead.
# `data = pricehd` keeps the lookup independent of attach().
# NOTE: this reuses the name m4 and therefore replaces the log(price) model.
m4 <- lm(price ~ I(hd^2), data = pricehd)
summary(m4)
## Output not recorded here: the summary previously pasted at this point came
## from the `price ~ hd` fit (R-squared 0.1851) and does not describe the
## corrected model. Re-run the script to regenerate it.
head(test)
##   X  hd price       pv
## 1 1  80  1499 1894.240
## 2 2  85  1795 1899.073
## 3 3 170  1595 1981.228
## 4 4 170  1849 1981.228
## 5 5 340  3295 2145.539
## 6 6 340  3695 2145.539
# Predicted price from the squared-hd model for the rows of `test`.
p4 <- predict(m4, newdata = test)