# Make the project folder the working directory so that the data file below
# can be referenced by its bare file name.
setwd(dir = "E:\\mikhilesh\\Horizon 2020\\ApCoTe Yogesh Sky Analytics DA DS Learning")
# Read the waist circumference / adipose tissue measurements into `data`.
data <- read.csv(file = "waist circumfirance - adipose tissue july28.csv")
#data <- read.csv(file.choose()) #another way of selecting the data file from browser
#View(data)
# Size of the dataset as c(rows, columns).
dim(x = data)
## [1] 109   2
# Class of the object returned by read.csv().
class(x = data)
## [1] "data.frame"
# Column names (for a data frame these are simply its names).
names(data)
## [1] "Waist" "AT"
#rownames(data) #names of rows - usually not used
# Compact overview: one line per column with its type and first values.
str(object = data)
## 'data.frame':    109 obs. of  2 variables:
##  $ Waist: num  74.8 72.6 81.8 84 74.7 ...
##  $ AT   : num  25.7 25.9 42.6 42.8 29.8 ...
# attach() puts the columns of `data` on the search path so they can be used
# by bare name (AT, Waist).
# NOTE(review): attach() is generally discouraged because attached columns can
# be masked by, or mask, other objects. It is kept here only in case later code
# relies on the bare names; the calls below name the data frame explicitly and
# do not depend on it.
attach(data)
#detach(data) #function if you want to detach your dataset

# Sample standard deviation of each variable.
sd(data$AT)
## [1] 57.29476
sd(data$Waist)
## [1] 13.55912
summary(data) #descriptive statistics
##      Waist             AT        
##  Min.   : 63.5   Min.   : 11.44  
##  1st Qu.: 80.0   1st Qu.: 50.88  
##  Median : 90.8   Median : 96.54  
##  Mean   : 91.9   Mean   :101.89  
##  3rd Qu.:104.0   3rd Qu.:137.00  
##  Max.   :121.0   Max.   :253.00
# Scatter plot with the input (Waist) on the x-axis and the output (AT) on the
# y-axis. The formula form `output ~ input` matches the lm() calls below and
# labels the axes with the column names.
plot(AT ~ Waist, data = data)

# Pearson correlation coefficient "r" between input and output
# (symmetric, so the argument order does not change the value).
cor(data$Waist, data$AT)
## [1] 0.8185578
#r = 0.82: moderate correlation by the rule of thumb used here (r value between 0.65 and 0.85)
# Simple linear regression. General form: lm(output ~ input, data = dataset).
# Here AT is the response and Waist is the single predictor.
model1 <- lm(formula = AT ~ Waist, data = data)
# Coefficients, residual summary, R-squared and F-test of the fit.
summary(object = model1)
## 
## Call:
## lm(formula = AT ~ Waist, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -107.288  -19.143   -2.939   16.376   90.342 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -215.9815    21.7963  -9.909   <2e-16 ***
## Waist          3.4589     0.2347  14.740   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33.06 on 107 degrees of freedom
## Multiple R-squared:   0.67,  Adjusted R-squared:  0.667 
## F-statistic: 217.3 on 1 and 107 DF,  p-value: < 2.2e-16
#The default confidence level is 95%, i.e. a significance level of 0.05 (1.00 - 0.95 = 0.05), so a coefficient is treated as statistically significant when its p-value is < 0.05.
#If we are considering a 90% confidence level, the p-value should be < 0.10 (1.00 - 0.90 = 0.10).
#If the p-value is GREATER than the chosen significance level, there is not enough evidence that the predictor is related to the outcome variable, so the model should not be relied on to predict it.

#Multiple R-squared: 0.67 - the model explains 67% of the variance in AT (MODERATE STRENGTH MODEL)
#IMPROVING PERFORMANCE/ACCURACY OF MODEL - TRANSFORMATION: applying a mathematical operation to the data
# Logarithmic transformation: the log is applied to the INPUT variable,
# i.e. AT is regressed on log(Waist).
model2 <- lm(formula = AT ~ log(Waist), data = data)
summary(object = model2)
## 
## Call:
## lm(formula = AT ~ log(Waist), data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.473 -18.273  -2.374  14.538  90.400 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1328.34      95.92  -13.85   <2e-16 ***
## log(Waist)    317.14      21.26   14.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.8 on 107 degrees of freedom
## Multiple R-squared:  0.6753, Adjusted R-squared:  0.6723 
## F-statistic: 222.6 on 1 and 107 DF,  p-value: < 2.2e-16
#Multiple R-squared: 0.6753 (an increase of only 0.0053, i.e. 0.53 percentage points, over model1 - not a significant increase)

# Exponential transformation: the log is applied to the OUTPUT variable,
# i.e. log(AT) is regressed on Waist.
model3 <- lm(formula = log(AT) ~ Waist, data = data)
summary(object = model3)
## 
## Call:
## lm(formula = log(AT) ~ Waist, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05086 -0.21688  0.03623  0.23044  0.82862 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.741021   0.232628   3.185  0.00189 ** 
## Waist       0.040252   0.002504  16.073  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3529 on 107 degrees of freedom
## Multiple R-squared:  0.7071, Adjusted R-squared:  0.7044 
## F-statistic: 258.3 on 1 and 107 DF,  p-value: < 2.2e-16
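#Multiple R-squared: 0.7071 (an increase of 0.0371, i.e. 3.71 percentage points, over model1 - not a significant increase, but still better than model2). Note: this R-squared is measured on log(AT), so it is not strictly comparable with the R-squared of model1 and model2, which are measured on AT.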
# Predicted AT values: apply model1 to the same rows it was fitted on.
# predict() returns a numeric vector named by row.
pv <- predict(object = model1, newdata = data)
pv
##          1          2          3          4          5          6          7 
##  42.568252  35.131704  66.953210  74.389758  42.222366  32.537559  63.840237 
##          8          9         10         11         12         13         14 
##  72.487385   3.656083  37.207020  32.710502  43.432966  36.861134  57.268404 
##         15         16         17         18         19         20         21 
##  50.350685  22.160981  46.718883  40.492936  39.282335  46.545940  49.831856 
##         22         23         24         25         26         27         28 
##  63.840237  60.381377  92.548770  67.644982 102.233576  83.555735  62.456693 
##         29         30         31         32         33         34         35 
##  81.480420  69.374412  72.833271  88.744024  98.082945  93.240542 136.822170 
##         36         37         38         39         40         41         42 
## 110.880725  98.774717 140.281029  60.727263  57.268404  72.833271  46.891826 
##         43         44         45         46         47         48         49 
##  62.456693  83.209849  71.103842 154.462353 110.188953 110.880725  59.689606 
##         50         51         52         53         54         55         56 
##  58.306062  94.624085  73.870929  78.713332  45.162396  55.193088  55.884860 
##         57         58         59         60         61         62         63 
##  87.706367  82.518078  79.750990  73.525043  52.426001  77.675674  60.035492 
##         64         65         66         67         68         69         70 
## 158.612984 197.698095 198.735753 117.798443 148.928178 147.198748 154.116467 
##         71         72         73         74         75         76         77 
## 154.116467 133.363311 119.527873 129.904451 157.575326 129.904451 140.281029 
##         78         79         80         81         82         83         84 
## 143.739889 150.657608 161.034186 142.010459 164.493045 164.493045 171.410764 
##         85         86         87         88         89         90         91 
## 159.304756 143.739889 167.951905 159.304756 202.540498 161.034186 121.257303 
##         92         93         94         95         96         97         98 
## 148.928178 122.986732 110.880725 119.527873 147.198748 150.657608 126.445592 
##         99        100        101        102        103        104        105 
##  98.774717 138.551600 150.657608 161.380072 181.787342 133.363311 130.250337 
##        106        107        108        109 
## 106.730093 136.130398 157.229440 159.304756
# Turn the named numeric vector of predictions into a one-column data frame
# (the column takes the name of the vector, "pv").
pv1 <- as.data.frame(x = pv)
pv1
##             pv
## 1    42.568252
## 2    35.131704
## 3    66.953210
## 4    74.389758
## 5    42.222366
## 6    32.537559
## 7    63.840237
## 8    72.487385
## 9     3.656083
## 10   37.207020
## 11   32.710502
## 12   43.432966
## 13   36.861134
## 14   57.268404
## 15   50.350685
## 16   22.160981
## 17   46.718883
## 18   40.492936
## 19   39.282335
## 20   46.545940
## 21   49.831856
## 22   63.840237
## 23   60.381377
## 24   92.548770
## 25   67.644982
## 26  102.233576
## 27   83.555735
## 28   62.456693
## 29   81.480420
## 30   69.374412
## 31   72.833271
## 32   88.744024
## 33   98.082945
## 34   93.240542
## 35  136.822170
## 36  110.880725
## 37   98.774717
## 38  140.281029
## 39   60.727263
## 40   57.268404
## 41   72.833271
## 42   46.891826
## 43   62.456693
## 44   83.209849
## 45   71.103842
## 46  154.462353
## 47  110.188953
## 48  110.880725
## 49   59.689606
## 50   58.306062
## 51   94.624085
## 52   73.870929
## 53   78.713332
## 54   45.162396
## 55   55.193088
## 56   55.884860
## 57   87.706367
## 58   82.518078
## 59   79.750990
## 60   73.525043
## 61   52.426001
## 62   77.675674
## 63   60.035492
## 64  158.612984
## 65  197.698095
## 66  198.735753
## 67  117.798443
## 68  148.928178
## 69  147.198748
## 70  154.116467
## 71  154.116467
## 72  133.363311
## 73  119.527873
## 74  129.904451
## 75  157.575326
## 76  129.904451
## 77  140.281029
## 78  143.739889
## 79  150.657608
## 80  161.034186
## 81  142.010459
## 82  164.493045
## 83  164.493045
## 84  171.410764
## 85  159.304756
## 86  143.739889
## 87  167.951905
## 88  159.304756
## 89  202.540498
## 90  161.034186
## 91  121.257303
## 92  148.928178
## 93  122.986732
## 94  110.880725
## 95  119.527873
## 96  147.198748
## 97  150.657608
## 98  126.445592
## 99   98.774717
## 100 138.551600
## 101 150.657608
## 102 161.380072
## 103 181.787342
## 104 133.363311
## 105 130.250337
## 106 106.730093
## 107 136.130398
## 108 157.229440
## 109 159.304756
# Column-bind the predictions onto the original dataset so that each row shows
# the measured Waist, the actual AT and the value predicted by model1 (pv).
final1 <- cbind(data, pv = pv)
final1
##      Waist     AT         pv
## 1    74.75  25.72  42.568252
## 2    72.60  25.89  35.131704
## 3    81.80  42.60  66.953210
## 4    83.95  42.80  74.389758
## 5    74.65  29.84  42.222366
## 6    71.85  21.68  32.537559
## 7    80.90  29.08  63.840237
## 8    83.40  32.98  72.487385
## 9    63.50  11.44   3.656083
## 10   73.20  32.22  37.207020
## 11   71.90  28.32  32.710502
## 12   75.00  43.86  43.432966
## 13   73.10  38.21  36.861134
## 14   79.00  42.48  57.268404
## 15   77.00  30.96  50.350685
## 16   68.85  55.78  22.160981
## 17   75.95  43.78  46.718883
## 18   74.15  33.41  40.492936
## 19   73.80  43.35  39.282335
## 20   75.90  29.31  46.545940
## 21   76.85  36.60  49.831856
## 22   80.90  40.25  63.840237
## 23   79.90  35.43  60.381377
## 24   89.20  60.09  92.548770
## 25   82.00  45.84  67.644982
## 26   92.00  70.40 102.233576
## 27   86.60  83.45  83.555735
## 28   80.50  84.30  62.456693
## 29   86.00  78.89  81.480420
## 30   82.50  64.75  69.374412
## 31   83.50  72.56  72.833271
## 32   88.10  89.31  88.744024
## 33   90.80  78.94  98.082945
## 34   89.40  83.55  93.240542
## 35  102.00 127.00 136.822170
## 36   94.50 121.00 110.880725
## 37   91.00 107.00  98.774717
## 38  103.00 129.00 140.281029
## 39   80.00  74.02  60.727263
## 40   79.00  55.48  57.268404
## 41   83.50  73.13  72.833271
## 42   76.00  50.50  46.891826
## 43   80.50  50.88  62.456693
## 44   86.50 140.00  83.209849
## 45   83.00  96.54  71.103842
## 46  107.10 118.00 154.462353
## 47   94.30 107.00 110.188953
## 48   94.50 123.00 110.880725
## 49   79.70  65.92  59.689606
## 50   79.30  81.29  58.306062
## 51   89.80 111.00  94.624085
## 52   83.80  90.73  73.870929
## 53   85.20 133.00  78.713332
## 54   75.50  41.90  45.162396
## 55   78.40  41.71  55.193088
## 56   78.60  58.16  55.884860
## 57   87.80  88.85  87.706367
## 58   86.30 155.00  82.518078
## 59   85.50  70.77  79.750990
## 60   83.70  75.08  73.525043
## 61   77.60  57.05  52.426001
## 62   84.90  99.73  77.675674
## 63   79.80  27.96  60.035492
## 64  108.30 123.00 158.612984
## 65  119.60  90.41 197.698095
## 66  119.90 106.00 198.735753
## 67   96.50 144.00 117.798443
## 68  105.50 121.00 148.928178
## 69  105.00  97.13 147.198748
## 70  107.00 166.00 154.116467
## 71  107.00  87.99 154.116467
## 72  101.00 154.00 133.363311
## 73   97.00 100.00 119.527873
## 74  100.00 123.00 129.904451
## 75  108.00 217.00 157.575326
## 76  100.00 140.00 129.904451
## 77  103.00 109.00 140.281029
## 78  104.00 127.00 143.739889
## 79  106.00 112.00 150.657608
## 80  109.00 192.00 161.034186
## 81  103.50 132.00 142.010459
## 82  110.00 126.00 164.493045
## 83  110.00 153.00 164.493045
## 84  112.00 158.00 171.410764
## 85  108.50 183.00 159.304756
## 86  104.00 184.00 143.739889
## 87  111.00 121.00 167.951905
## 88  108.50 159.00 159.304756
## 89  121.00 245.00 202.540498
## 90  109.00 137.00 161.034186
## 91   97.50 165.00 121.257303
## 92  105.50 152.00 148.928178
## 93   98.00 181.00 122.986732
## 94   94.50  80.95 110.880725
## 95   97.00 137.00 119.527873
## 96  105.00 125.00 147.198748
## 97  106.00 241.00 150.657608
## 98   99.00 134.00 126.445592
## 99   91.00 150.00  98.774717
## 100 102.50 198.00 138.551600
## 101 106.00 151.00 150.657608
## 102 109.10 229.00 161.380072
## 103 115.00 253.00 181.787342
## 104 101.00 188.00 133.363311
## 105 100.10 124.00 130.250337
## 106  93.30  62.20 106.730093
## 107 101.80 133.00 136.130398
## 108 107.90 208.00 157.229440
## 109 108.50 208.00 159.304756
# Predict AT (adipose tissue) with model1 for new patients for whom only the
# waist circumference has been measured.
X <- read.csv("E:\\mikhilesh\\Horizon 2020\\ApCoTe Yogesh Sky Analytics DA DS Learning\\waist circumfirance - unknown X adipose tissue july28.csv")

# predict() looks the predictor up by name, so fail early with a clear message
# if the file does not provide it.
if (!"Waist" %in% names(X)) {
  stop("The new-patient file must contain a 'Waist' column.", call. = FALSE)
}

# model1 is a straight line with a negative intercept, so a waist below the
# range it was fitted on can produce a negative, physically impossible AT.
# Flag such extrapolated rows instead of reporting them silently.
# local() keeps the helper variables out of the global environment.
local({
  fitted_range <- range(model.frame(model1)$Waist)
  extrapolated <- which(
    X$Waist < fitted_range[1] | X$Waist > fitted_range[2]
  )
  if (length(extrapolated) > 0) {
    warning(
      sprintf(
        "Waist outside the fitted range [%s, %s] for row(s) %s; these predictions are extrapolations and may be unreliable.",
        format(fitted_range[1]),
        format(fitted_range[2]),
        paste(extrapolated, collapse = ", ")
      ),
      call. = FALSE
    )
  }
})

pred1 <- predict(model1, newdata = X)
pred1
##         1         2         3         4 
##  40.31999 -19.86416  36.86113   9.88203