Linear Regression
Predict fat (adipose tissue - AT) in body based on waist circumference.
# Load the waist-circumference / adipose-tissue measurements.
# The file is addressed by its full path rather than via setwd(), so the
# working directory of the R session is left untouched.
data_dir <- "E:\\mikhilesh\\Horizon 2020\\ApCoTe Yogesh Sky Analytics DA DS Learning"
data <- read.csv(
  file.path(data_dir, "waist circumfirance - adipose tissue july28.csv")
)
#data <- read.csv(file.choose()) #another way of selecting the data file from browser
#View(data)
# Quick structural checks on the loaded data. Lines starting with "##" are the
# console output recorded alongside each call.
dim(data) # number of rows and columns of the dataset
## [1] 109 2
class(data) # tells you which kind of object the dataset is (e.g. data.frame)
## [1] "data.frame"
colnames(data) # names of the columns
## [1] "Waist" "AT"
#rownames(data) # names of the rows - usually not needed
str(data) # compact structure: type and first few values of every column
## 'data.frame': 109 obs. of 2 variables:
## $ Waist: num 74.8 72.6 81.8 84 74.7 ...
## $ AT : num 25.7 25.9 42.6 42.8 29.8 ...
# Columns are referenced explicitly as data$<name> instead of attach()-ing the
# data frame: attach() puts the columns on the search path, where they can
# silently mask (or be masked by) other objects with the same name.
sd(data$AT) # standard deviation of the output variable
## [1] 57.29476
sd(data$Waist) # standard deviation of the input variable
## [1] 13.55912
summary(data) #descriptive statistics
## Waist AT
## Min. : 63.5 Min. : 11.44
## 1st Qu.: 80.0 1st Qu.: 50.88
## Median : 90.8 Median : 96.54
## Mean : 91.9 Mean :101.89
## 3rd Qu.:104.0 3rd Qu.:137.00
## Max. :121.0 Max. :253.00
# Scatter plot with the input (Waist) on the x-axis and the output (AT) on the
# y-axis; the formula reads "AT as a function of Waist", the same notation
# lm() uses further down.
plot(AT ~ Waist, data = data)
# Correlation coefficient "r" between input and output. cor() is symmetric in
# its two arguments, so the argument order does not affect the value.
cor(data$Waist, data$AT)
## [1] 0.8185578
#r = 0.82 (rounded): moderate correlation by the rule of thumb used here (r value between 0.65 and 0.85)
# Fit a simple linear regression of the output AT on the input Waist.
# Syntax for a linear model: lm(output Y ~ input X, data = dataset)
model1 <- lm(AT ~ Waist, data = data)
summary(model1) # coefficients, residual summary, R-squared and F-test
##
## Call:
## lm(formula = AT ~ Waist, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -107.288 -19.143 -2.939 16.376 90.342
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -215.9815 21.7963 -9.909 <2e-16 ***
## Waist 3.4589 0.2347 14.740 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33.06 on 107 degrees of freedom
## Multiple R-squared: 0.67, Adjusted R-squared: 0.667
## F-statistic: 217.3 on 1 and 107 DF, p-value: < 2.2e-16
#The usual confidence level is 95%, i.e. a significance level of 0.05 (1.00 - 0.95 = 0.05), so a coefficient is treated as significant when its p-value is < 0.05
#If we work with a 90% confidence level, the significance level is 0.10 (1.00 - 0.90 = 0.10), so the p-value should be < 0.10
#IF a P-VALUE is GREATER than the chosen significance level, there is not enough evidence that the corresponding term (here: Waist) is related to the outcome variable.
#Multiple R-squared: 0.67 - the model explains about 67% of the variance in AT (MODERATE STRENGTH MODEL); note this is not the same as being "67% accurate"
The equation of the regression model is: Y = b0 + b1X. Replacing the intercept (b0) and the slope (b1) with the estimates from summary(model1) gives our model: AT = (-215.9815) + (3.4589) * Waist
#IMPROVING PERFORMANCE/ACCURACY OF MODEL - TRANSFORMATION: applying a mathematical operation to the data before fitting
model2 <- lm(AT ~ log(Waist), data = data) #LOGARITHMIC TRANSFORMATION - applying log to the INPUT variable
summary(model2)
##
## Call:
## lm(formula = AT ~ log(Waist), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.473 -18.273 -2.374 14.538 90.400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1328.34 95.92 -13.85 <2e-16 ***
## log(Waist) 317.14 21.26 14.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.8 on 107 degrees of freedom
## Multiple R-squared: 0.6753, Adjusted R-squared: 0.6723
## F-statistic: 222.6 on 1 and 107 DF, p-value: < 2.2e-16
#Multiple R-squared: 0.6753 (an increase of only 0.0053 over model1, i.e. about 0.5 percentage points - not a meaningful improvement)
model3 <- lm(log(AT) ~ Waist, data = data) #EXPONENTIAL model - applying log to the OUTPUT variable, i.e. log(AT) = b0 + b1 * Waist
summary(model3)
##
## Call:
## lm(formula = log(AT) ~ Waist, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.05086 -0.21688 0.03623 0.23044 0.82862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.741021 0.232628 3.185 0.00189 **
## Waist 0.040252 0.002504 16.073 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3529 on 107 degrees of freedom
## Multiple R-squared: 0.7071, Adjusted R-squared: 0.7044
## F-statistic: 258.3 on 1 and 107 DF, p-value: < 2.2e-16
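#Multiple R-squared: 0.7071 (0.0371 higher than model1, i.e. about 3.7 percentage points - a modest increase, but still better than model2). NOTE: this R-squared is measured on the log(AT) scale, so it is not strictly comparable with model1 and model2, which model AT itself.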
#Calculating predicted fat values from waist circumference
pv <- predict(model1, newdata = data) #predict() calculates the fat values pv from model1 for all 109 waist values in the dataset `data`
pv
## 1 2 3 4 5 6 7
## 42.568252 35.131704 66.953210 74.389758 42.222366 32.537559 63.840237
## 8 9 10 11 12 13 14
## 72.487385 3.656083 37.207020 32.710502 43.432966 36.861134 57.268404
## 15 16 17 18 19 20 21
## 50.350685 22.160981 46.718883 40.492936 39.282335 46.545940 49.831856
## 22 23 24 25 26 27 28
## 63.840237 60.381377 92.548770 67.644982 102.233576 83.555735 62.456693
## 29 30 31 32 33 34 35
## 81.480420 69.374412 72.833271 88.744024 98.082945 93.240542 136.822170
## 36 37 38 39 40 41 42
## 110.880725 98.774717 140.281029 60.727263 57.268404 72.833271 46.891826
## 43 44 45 46 47 48 49
## 62.456693 83.209849 71.103842 154.462353 110.188953 110.880725 59.689606
## 50 51 52 53 54 55 56
## 58.306062 94.624085 73.870929 78.713332 45.162396 55.193088 55.884860
## 57 58 59 60 61 62 63
## 87.706367 82.518078 79.750990 73.525043 52.426001 77.675674 60.035492
## 64 65 66 67 68 69 70
## 158.612984 197.698095 198.735753 117.798443 148.928178 147.198748 154.116467
## 71 72 73 74 75 76 77
## 154.116467 133.363311 119.527873 129.904451 157.575326 129.904451 140.281029
## 78 79 80 81 82 83 84
## 143.739889 150.657608 161.034186 142.010459 164.493045 164.493045 171.410764
## 85 86 87 88 89 90 91
## 159.304756 143.739889 167.951905 159.304756 202.540498 161.034186 121.257303
## 92 93 94 95 96 97 98
## 148.928178 122.986732 110.880725 119.527873 147.198748 150.657608 126.445592
## 99 100 101 102 103 104 105
## 98.774717 138.551600 150.657608 161.380072 181.787342 133.363311 130.250337
## 106 107 108 109
## 106.730093 136.130398 157.229440 159.304756
class(pv) #checking the class of pv
## [1] "numeric"
#converting the numeric vector pv into a data.frame using as.data.frame
pv1 <- as.data.frame(pv)
pv1
## pv
## 1 42.568252
## 2 35.131704
## 3 66.953210
## 4 74.389758
## 5 42.222366
## 6 32.537559
## 7 63.840237
## 8 72.487385
## 9 3.656083
## 10 37.207020
## 11 32.710502
## 12 43.432966
## 13 36.861134
## 14 57.268404
## 15 50.350685
## 16 22.160981
## 17 46.718883
## 18 40.492936
## 19 39.282335
## 20 46.545940
## 21 49.831856
## 22 63.840237
## 23 60.381377
## 24 92.548770
## 25 67.644982
## 26 102.233576
## 27 83.555735
## 28 62.456693
## 29 81.480420
## 30 69.374412
## 31 72.833271
## 32 88.744024
## 33 98.082945
## 34 93.240542
## 35 136.822170
## 36 110.880725
## 37 98.774717
## 38 140.281029
## 39 60.727263
## 40 57.268404
## 41 72.833271
## 42 46.891826
## 43 62.456693
## 44 83.209849
## 45 71.103842
## 46 154.462353
## 47 110.188953
## 48 110.880725
## 49 59.689606
## 50 58.306062
## 51 94.624085
## 52 73.870929
## 53 78.713332
## 54 45.162396
## 55 55.193088
## 56 55.884860
## 57 87.706367
## 58 82.518078
## 59 79.750990
## 60 73.525043
## 61 52.426001
## 62 77.675674
## 63 60.035492
## 64 158.612984
## 65 197.698095
## 66 198.735753
## 67 117.798443
## 68 148.928178
## 69 147.198748
## 70 154.116467
## 71 154.116467
## 72 133.363311
## 73 119.527873
## 74 129.904451
## 75 157.575326
## 76 129.904451
## 77 140.281029
## 78 143.739889
## 79 150.657608
## 80 161.034186
## 81 142.010459
## 82 164.493045
## 83 164.493045
## 84 171.410764
## 85 159.304756
## 86 143.739889
## 87 167.951905
## 88 159.304756
## 89 202.540498
## 90 161.034186
## 91 121.257303
## 92 148.928178
## 93 122.986732
## 94 110.880725
## 95 119.527873
## 96 147.198748
## 97 150.657608
## 98 126.445592
## 99 98.774717
## 100 138.551600
## 101 150.657608
## 102 161.380072
## 103 181.787342
## 104 133.363311
## 105 130.250337
## 106 106.730093
## 107 136.130398
## 108 157.229440
## 109 159.304756
#combining the predictions pv with the original data using cbind (column bind): pv is appended as a new column to the right of the original columns
final1 <- cbind(data, pv)
final1 #shows the actual values (AT) and the predicted values (pv) from model1 side by side
## Waist AT pv
## 1 74.75 25.72 42.568252
## 2 72.60 25.89 35.131704
## 3 81.80 42.60 66.953210
## 4 83.95 42.80 74.389758
## 5 74.65 29.84 42.222366
## 6 71.85 21.68 32.537559
## 7 80.90 29.08 63.840237
## 8 83.40 32.98 72.487385
## 9 63.50 11.44 3.656083
## 10 73.20 32.22 37.207020
## 11 71.90 28.32 32.710502
## 12 75.00 43.86 43.432966
## 13 73.10 38.21 36.861134
## 14 79.00 42.48 57.268404
## 15 77.00 30.96 50.350685
## 16 68.85 55.78 22.160981
## 17 75.95 43.78 46.718883
## 18 74.15 33.41 40.492936
## 19 73.80 43.35 39.282335
## 20 75.90 29.31 46.545940
## 21 76.85 36.60 49.831856
## 22 80.90 40.25 63.840237
## 23 79.90 35.43 60.381377
## 24 89.20 60.09 92.548770
## 25 82.00 45.84 67.644982
## 26 92.00 70.40 102.233576
## 27 86.60 83.45 83.555735
## 28 80.50 84.30 62.456693
## 29 86.00 78.89 81.480420
## 30 82.50 64.75 69.374412
## 31 83.50 72.56 72.833271
## 32 88.10 89.31 88.744024
## 33 90.80 78.94 98.082945
## 34 89.40 83.55 93.240542
## 35 102.00 127.00 136.822170
## 36 94.50 121.00 110.880725
## 37 91.00 107.00 98.774717
## 38 103.00 129.00 140.281029
## 39 80.00 74.02 60.727263
## 40 79.00 55.48 57.268404
## 41 83.50 73.13 72.833271
## 42 76.00 50.50 46.891826
## 43 80.50 50.88 62.456693
## 44 86.50 140.00 83.209849
## 45 83.00 96.54 71.103842
## 46 107.10 118.00 154.462353
## 47 94.30 107.00 110.188953
## 48 94.50 123.00 110.880725
## 49 79.70 65.92 59.689606
## 50 79.30 81.29 58.306062
## 51 89.80 111.00 94.624085
## 52 83.80 90.73 73.870929
## 53 85.20 133.00 78.713332
## 54 75.50 41.90 45.162396
## 55 78.40 41.71 55.193088
## 56 78.60 58.16 55.884860
## 57 87.80 88.85 87.706367
## 58 86.30 155.00 82.518078
## 59 85.50 70.77 79.750990
## 60 83.70 75.08 73.525043
## 61 77.60 57.05 52.426001
## 62 84.90 99.73 77.675674
## 63 79.80 27.96 60.035492
## 64 108.30 123.00 158.612984
## 65 119.60 90.41 197.698095
## 66 119.90 106.00 198.735753
## 67 96.50 144.00 117.798443
## 68 105.50 121.00 148.928178
## 69 105.00 97.13 147.198748
## 70 107.00 166.00 154.116467
## 71 107.00 87.99 154.116467
## 72 101.00 154.00 133.363311
## 73 97.00 100.00 119.527873
## 74 100.00 123.00 129.904451
## 75 108.00 217.00 157.575326
## 76 100.00 140.00 129.904451
## 77 103.00 109.00 140.281029
## 78 104.00 127.00 143.739889
## 79 106.00 112.00 150.657608
## 80 109.00 192.00 161.034186
## 81 103.50 132.00 142.010459
## 82 110.00 126.00 164.493045
## 83 110.00 153.00 164.493045
## 84 112.00 158.00 171.410764
## 85 108.50 183.00 159.304756
## 86 104.00 184.00 143.739889
## 87 111.00 121.00 167.951905
## 88 108.50 159.00 159.304756
## 89 121.00 245.00 202.540498
## 90 109.00 137.00 161.034186
## 91 97.50 165.00 121.257303
## 92 105.50 152.00 148.928178
## 93 98.00 181.00 122.986732
## 94 94.50 80.95 110.880725
## 95 97.00 137.00 119.527873
## 96 105.00 125.00 147.198748
## 97 106.00 241.00 150.657608
## 98 99.00 134.00 126.445592
## 99 91.00 150.00 98.774717
## 100 102.50 198.00 138.551600
## 101 106.00 151.00 150.657608
## 102 109.10 229.00 161.380072
## 103 115.00 253.00 181.787342
## 104 101.00 188.00 133.363311
## 105 100.10 124.00 130.250337
## 106 93.30 62.20 106.730093
## 107 101.80 133.00 136.130398
## 108 107.90 208.00 157.229440
## 109 108.50 208.00 159.304756
#NOW we can use model1 to predict fat values for new patients x (only waist circumference is known)
#predicting AT (adipose tissue) using model1 for 4 new patients for whom only the waist circumference was measured
# NOTE(review): absolute path into one user's Downloads folder - the script only runs where this exact file exists; confirm the intended location of x.csv.
x <- read.csv("C:/Users/Akshada/Downloads/x.csv") #reading new patient data with unknown fat values
x
## Waist
## 1 74.1
## 2 56.7
## 3 73.1
## 4 65.3
# Predict AT for the new patients in `x` (only Waist is known) using model1.
# model1 was fitted on the Waist values in `data`; a straight-line fit is not
# reliable outside that range and can even return a negative AT (see the
# second patient in the output below), so any extrapolation is flagged.
waist_range <- range(data$Waist, na.rm = TRUE)
outside <- !is.na(x$Waist) &
  (x$Waist < waist_range[1] | x$Waist > waist_range[2])
if (any(outside)) {
  warning(
    "Waist value(s) outside the fitted range [", waist_range[1], ", ",
    waist_range[2], "] in row(s) ", paste(which(outside), collapse = ", "),
    "; predictions there are extrapolations.",
    call. = FALSE
  )
}
pred <- predict(model1, newdata = x) # predicted fat values, one per row of x
pred
## 1 2 3 4
## 40.31999 -19.86416 36.86113 9.88203
#Combining the predicted fat values with the new patients' waist values (for easy reading); x holds no measured fat values to compare against
finalx <- cbind(x, pred) #column bind
finalx
## Waist pred
## 1 74.1 40.31999
## 2 56.7 -19.86416
## 3 73.1 36.86113
## 4 65.3 9.88203