# Locate the data folder without calling setwd(): a machine-specific absolute
# path breaks the script on any other computer and setwd() leaves the session's
# working directory changed. Set the LAB_DATA_DIR environment variable to
# override the default folder.
data_dir <- Sys.getenv("LAB_DATA_DIR", unset = "F:/RData")
data_file <- file.path(data_dir, "labW9.xlsx")
if (!file.exists(data_file)) {
  stop("Data file not found: ", data_file, call. = FALSE)
}
library(readxl)
# Read the first worksheet of the workbook.
df <- read_xlsx(data_file, sheet = 1)
# Per-column summary statistics.
summary(df)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000
# Determine whether there are explicit missing values (NA)
anyNA(df)
## [1] FALSE
# anyNA() only detects explicit NA. The summary above reports a minimum of 0
# for Glucose, BloodPressure, SkinThickness, Insulin and BMI.
# NOTE(review): zeros in these measurements look like zero-coded missing
# values -- confirm with the data source. Count them per column so they are
# not silently overlooked; df itself is left unchanged.
zero_suspect_cols <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)
colSums(df[zero_suspect_cols] == 0)
library(dplyr)
## 
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Display the rows ordered by Outcome, highest first (printed only, not stored)
df %>% dplyr::arrange(dplyr::desc(Outcome))
## # A tibble: 768 x 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           6     148            72            35       0  33.6
##  2           8     183            64             0       0  23.3
##  3           0     137            40            35     168  43.1
##  4           3      78            50            32      88  31  
##  5           2     197            70            45     543  30.5
##  6           8     125            96             0       0   0  
##  7          10     168            74             0       0  38  
##  8           1     189            60            23     846  30.1
##  9           5     166            72            19     175  25.8
## 10           7     100             0             0       0  30  
## # ... with 758 more rows, and 3 more variables: DiabetesPedigreeFunction <dbl>,
## #   Age <dbl>, Outcome <dbl>
#Split data into training and test data subsets
library(ggplot2)
library(lattice)
library(caret)


# Fix the RNG seed so the split below is reproducible
set.seed(20220115)
# Ask caret for the row positions of a 70% training sample, drawn with respect
# to the BloodPressure column; list = FALSE returns them as a matrix.
index <- createDataPartition(y = df[["BloodPressure"]], p = 0.7, list = FALSE)
# Rows not selected above form the test set (the remaining ~30%)
data_test <- df[-index, ]
# Rows selected above form the training set
data_train <- df[index, ]
# Rows and columns of the training set
dim(data_train)
## [1] 540   9
# Rows and columns of the test set
dim(data_test)
## [1] 228   9
# Linear regression of BloodPressure on all other columns, fitted on the
# training set only
model <- lm(formula = BloodPressure ~ ., data = data_train)

# Print the call and the estimated coefficients
print(model)
## 
## Call:
## lm(formula = BloodPressure ~ ., data = data_train)
## 
## Coefficients:
##              (Intercept)               Pregnancies                   Glucose  
##                26.806800                  0.356241                  0.061628  
##            SkinThickness                   Insulin                       BMI  
##                 0.194572                 -0.006039                  0.703129  
## DiabetesPedigreeFunction                       Age                   Outcome  
##                -1.864982                  0.310404                 -5.762901

# Evaluate the model on the held-out test set, which would otherwise be created
# but never used. postResample() reports RMSE, R-squared and MAE of the
# predictions against the observed BloodPressure values.
test_pred <- predict(model, newdata = data_test)
postResample(pred = test_pred, obs = data_test$BloodPressure)
# Cross-validation: assign every row of df to one of 10 folds.
# With list = FALSE the result is one fold number per row.
# Note: this overwrites the train/test partition previously stored in `index`.
set.seed(1234)
index <- createFolds(y = df[["BloodPressure"]], k = 10, list = FALSE)

index
##   [1]  6  4  2  8  1 10  2  9  3  6  6  5 10  8  4 10  9  6  6 10  3  3  2  1  1
##  [26] 10  5  9  7  1  4  5  6  5  2  9  7  5  6  6  4  5  1  9  6  4  2  2  4  9
##  [51]  5  6  4  2  9  2 10  5  8  3  4  5  4  4  3  1  7  4  2  4  3  6  6  5  3
##  [76]  3  6  5  3  8  3  7  7  3  4  9  8  8  1  7  9  2  3  1  6  5  4  7  5  4
## [101]  2 10  6 10  2  4  8  8  5  2  3  6  5  7  3  5  5  5  4  3  5  2  4  1  2
## [126]  8  7  6  2  8  8  3  2  8  9  7  2  2  1  1 10  5  8  4  8  1  1  7  2 10
## [151]  3  7  4  2  2 10  7  2  6  6  8  4 10  6  2  6  5  9  8  7  6  6  3  8  2
## [176]  5 10  1  7  6  6  4  8  1  8  6  6  7  9 10  6  8 10  2  8  1  7  6 10  1
## [201]  3  9  9  3 10  3  7  9  9  7  7  4  9 10  5  1  6  1 10  2  1 10  3  3  9
## [226]  4  9  5  7  4 10  4  7  2  8 10  5  5  2  9  6  9  1  3  7  9  9  1  7  8
## [251]  1  3  4 10  6  3  4  6  1 10  3  6  8  4  2  1  7  1 10  9  3  8  6  5  5
## [276]  9  8  7  2  3  7  8  8 10  6 10  4  6  5  1  6  6  8  7  8  3  6  4  4 10
## [301]  5  5  8  4 10  7  1  4 10  1 10  2  2  2  6 10  9  9  9  3  9  7  9  6  7
## [326]  7  1  1  9  6  8 10  5  1  7  8  3  3  2 10  9  1  3  2  3  2 10  6  7  4
## [351]  9  9  8  1  2  1  6  5  1  6  8  6  8  6  9  4  2  8  7  9  8  8 10  9  4
## [376]  3  1  5  6 10  7  4 10  3  4  7  4  6  3  4  6 10  7  5  7  1  1  3  1  6
## [401]  3  5  3  6  7  2  3  5  1  1  3  1 10  9  5  5  6  2  4  4  8  7  8  5  4
## [426]  3  9  9  9  2  4  9  5  8  9  9  7 10 10  3 10  8  7  2  9  3  9  7  6  3
## [451]  8  5  7  3  7  2  1  5  5  8  4  1  8  4  6  7  8  7 10  3  8  4 10  7  5
## [476]  7  1  2  8  4  2  4  8  3  4  3 10  3  9  8  8  2  2  7  3 10  8  3  7  9
## [501]  4  7  5  2  8  2  1  4  9  9  8  3  1  8  1  2  1  9  9  8  1  3  9  2  9
## [526]  5  4  5  9  1 10  1  5  6  1 10  1 10  3  2  4  7  4  6  4  4  3  3  4  9
## [551]  1  3  3  3  5  4 10  6  9  8  2  4  6  4  6 10  5  9 10  3  2  1  4  4  1
## [576]  5  4  6  9  1  6  2  1  7  8  9  7  5 10  5  5  1 10  2  5 10  7 10  5  1
## [601]  5  4 10  8  1  3 10  8  9  1  2  2  2 10  9  4  8  7  4  4  9 10  1  1  4
## [626]  1 10  3  8  1  2  2  7  2  2  6  4  2  8  8 10  9  5 10  3  7  9  8  6  7
## [651]  6  8  1  2  8  5  3  5  3  7  8  7  6  7 10  1  5  6  9  6  1  1  7  9  5
## [676]  5 10  9  2  9 10  6  2 10  6  2  5  6  7  7  3 10  3  8  5  7  5  7  7  5
## [701]  5  4  2  3  9  5  7  2  2  9  5  2 10  8  8 10  2  4  1  9  4  7  4  3  7
## [726]  9  3 10  3  4  2  3  8  6  7  8  8 10  8  9  5  6  6 10  9  7  5  3  5  6
## [751]  5  1  8  3  6 10  1  1  6 10 10  5  2  4  8  6  5 10
# Hold out fold 1 as the test fold; all other folds form the training data.
testIndex <- which(index == 1)
# Training data: select rows with a logical mask instead of df[-testIndex, ].
# Negative indexing with an empty testIndex would return zero rows rather
# than all rows.
datTraincv <- df[index != 1, ]
# Test data: the rows assigned to fold 1
datTestcv <- df[testIndex, ]


# Examine the correlation between variables
library(car)
## 载入需要的程辑包:carData
## 
## 载入程辑包:'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(carData)
# Pairwise correlation matrix of all columns, printed as text.
# Rule of thumb: two variables are considered highly correlated when the
# absolute value of their coefficient exceeds 0.8.
# The sample must not contain NA for this call.
cor(x = df, use = "everything", method = "pearson")
##                          Pregnancies    Glucose BloodPressure SkinThickness
## Pregnancies               1.00000000 0.12945867    0.14128198   -0.08167177
## Glucose                   0.12945867 1.00000000    0.15258959    0.05732789
## BloodPressure             0.14128198 0.15258959    1.00000000    0.20737054
## SkinThickness            -0.08167177 0.05732789    0.20737054    1.00000000
## Insulin                  -0.07353461 0.33135711    0.08893338    0.43678257
## BMI                       0.01768309 0.22107107    0.28180529    0.39257320
## DiabetesPedigreeFunction -0.03352267 0.13733730    0.04126495    0.18392757
## Age                       0.54434123 0.26351432    0.23952795   -0.11397026
## Outcome                   0.22189815 0.46658140    0.06506836    0.07475223
##                              Insulin        BMI DiabetesPedigreeFunction
## Pregnancies              -0.07353461 0.01768309              -0.03352267
## Glucose                   0.33135711 0.22107107               0.13733730
## BloodPressure             0.08893338 0.28180529               0.04126495
## SkinThickness             0.43678257 0.39257320               0.18392757
## Insulin                   1.00000000 0.19785906               0.18507093
## BMI                       0.19785906 1.00000000               0.14064695
## DiabetesPedigreeFunction  0.18507093 0.14064695               1.00000000
## Age                      -0.04216295 0.03624187               0.03356131
## Outcome                   0.13054795 0.29269466               0.17384407
##                                  Age    Outcome
## Pregnancies               0.54434123 0.22189815
## Glucose                   0.26351432 0.46658140
## BloodPressure             0.23952795 0.06506836
## SkinThickness            -0.11397026 0.07475223
## Insulin                  -0.04216295 0.13054795
## BMI                       0.03624187 0.29269466
## DiabetesPedigreeFunction  0.03356131 0.17384407
## Age                       1.00000000 0.23835598
## Outcome                   0.23835598 1.00000000
# Scatter plot matrix of the columns of df
scatterplotMatrix(df, main = "Scatter Plot Matrix")

# Scatter plot of Age (vertical axis) against BloodPressure (horizontal axis)
x <- df[["BloodPressure"]]
y <- df[["Age"]]
plot(x, y)

# Simple linear regression of y (Age) on x (BloodPressure) with an intercept
lm.sol <- lm(formula = y ~ 1 + x)

# Residual quantiles, coefficient table and overall fit statistics
summary(lm.sol)
## 
## Call:
## lm(formula = y ~ 1 + x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.155  -8.826  -3.535   6.718  48.816 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 23.18378    1.52951  15.158  < 2e-16 ***
## x            0.14553    0.02131   6.828 1.75e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.43 on 766 degrees of freedom
## Multiple R-squared:  0.05737,    Adjusted R-squared:  0.05614 
## F-statistic: 46.62 on 1 and 766 DF,  p-value: 1.752e-11
# Fitted-line plot: the scatter plot with descriptive axis labels, overlaid
# with the regression line of lm.sol
plot(x, y, xlab = "BloodPressure", ylab = "Age")
abline(reg = lm.sol)

# Predictor value at which to predict y
# NOTE(review): 0.24 lies far below the bulk of the BloodPressure values shown
# in the summary (median 72) -- confirm this is the intended value.
preds <- data.frame(x = 0.24)

# 95% prediction interval for y at the new x (columns fit, lwr, upr)
predict(
  object = lm.sol,
  newdata = preds,
  interval = "prediction",
  level = 0.95
)
##       fit       lwr      upr
## 1 23.2187 0.5912179 45.84619
# Residual analysis
# Stepwise model selection in both directions, starting from lm.sol
lm.step <- step(object = lm.sol, direction = "both")
## Start:  AIC=3743.44
## y ~ 1 + x
## 
##        Df Sum of Sq    RSS    AIC
## <none>               99992 3743.4
## - x     1    6086.1 106078 3786.8
# Predicted (fitted) values of the selected model
y.fit <- predict(lm.step)

# Standardized residuals of the selected model
y.rst <- rstandard(lm.step)

# Residual scatter plot: standardized residuals against fitted values
plot(y.rst ~ y.fit)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(datasets::cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.