# Folder that holds the lab workbook. The folder is joined onto the file name
# below instead of calling setwd(), so loading the data no longer changes the
# session's working directory as a side effect.
data_dir <- "F:\\RData"
library(readxl)
# Read the first sheet of the workbook into df.
df <- read_xlsx(file.path(data_dir, "labW9.xlsx"), sheet = 1)
# Per-column summary statistics (min, quartiles, mean, max).
summary(df)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# Determine whether there are missing values (explicit NA) anywhere in df.
anyNA(df)
## [1] FALSE
# NOTE(review): anyNA() only detects explicit NA values. summary(df) reports a
# minimum of 0 for Glucose, BloodPressure, SkinThickness, Insulin and BMI;
# those zeros may be placeholders for missing measurements -- confirm with the
# data source. Count the zeros per column so their extent is visible.
zero_check_cols <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)
colSums(df[, zero_check_cols] == 0)
library(dplyr)
##
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Display the rows ordered by Outcome, largest first. The sorted result is only
# printed; df itself is not reassigned.
df %>% dplyr::arrange(dplyr::desc(Outcome))
## # A tibble: 768 x 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 148 72 35 0 33.6
## 2 8 183 64 0 0 23.3
## 3 0 137 40 35 168 43.1
## 4 3 78 50 32 88 31
## 5 2 197 70 45 543 30.5
## 6 8 125 96 0 0 0
## 7 10 168 74 0 0 38
## 8 1 189 60 23 846 30.1
## 9 5 166 72 19 175 25.8
## 10 7 100 0 0 0 30
## # ... with 758 more rows, and 3 more variables: DiabetesPedigreeFunction <dbl>,
## # Age <dbl>, Outcome <dbl>
#Split data into training and test data subsets
library(ggplot2)
library(lattice)
library(caret)
# Fix the random seed so the partition below is reproducible.
set.seed(20220115)
# Draw the row indices of the training subset (p = 0.7 of the rows), sampled by
# caret on the BloodPressure column; every other row goes to the test subset.
index <- createDataPartition(y = df$BloodPressure, p = 0.7, list = FALSE)
data_train <- df[index, ]  # training data
data_test <- df[-index, ]  # test data
# Number of rows and columns in the training set
dim(data_train)
## [1] 540 9
# Number of rows and columns in the test set
dim(data_test)
## [1] 228 9
# Linear regression of BloodPressure on all remaining columns, fitted on the
# training subset only.
model <- lm(BloodPressure ~ ., data = data_train)
# summarize results
print(model)
##
## Call:
## lm(formula = BloodPressure ~ ., data = data_train)
##
## Coefficients:
## (Intercept) Pregnancies Glucose
## 26.806800 0.356241 0.061628
## SkinThickness Insulin BMI
## 0.194572 -0.006039 0.703129
## DiabetesPedigreeFunction Age Outcome
## -1.864982 0.310404 -5.762901
# Evaluate the fitted model on the held-out test subset, which was created by
# the split but otherwise never used: predict BloodPressure for data_test and
# report the root-mean-squared and mean-absolute prediction errors.
test_pred <- predict(model, newdata = data_test)
test_resid <- data_test$BloodPressure - test_pred
c(RMSE = sqrt(mean(test_resid^2)), MAE = mean(abs(test_resid)))
# Cross-validation: assign every row of df to one of 10 folds.
# Note that this reuses (overwrites) the name `index` from the earlier
# train/test partition.
set.seed(1234)
index <- createFolds(y = df$BloodPressure, k = 10, list = FALSE)
# Show the fold number given to each row.
print(index)
## [1] 6 4 2 8 1 10 2 9 3 6 6 5 10 8 4 10 9 6 6 10 3 3 2 1 1
## [26] 10 5 9 7 1 4 5 6 5 2 9 7 5 6 6 4 5 1 9 6 4 2 2 4 9
## [51] 5 6 4 2 9 2 10 5 8 3 4 5 4 4 3 1 7 4 2 4 3 6 6 5 3
## [76] 3 6 5 3 8 3 7 7 3 4 9 8 8 1 7 9 2 3 1 6 5 4 7 5 4
## [101] 2 10 6 10 2 4 8 8 5 2 3 6 5 7 3 5 5 5 4 3 5 2 4 1 2
## [126] 8 7 6 2 8 8 3 2 8 9 7 2 2 1 1 10 5 8 4 8 1 1 7 2 10
## [151] 3 7 4 2 2 10 7 2 6 6 8 4 10 6 2 6 5 9 8 7 6 6 3 8 2
## [176] 5 10 1 7 6 6 4 8 1 8 6 6 7 9 10 6 8 10 2 8 1 7 6 10 1
## [201] 3 9 9 3 10 3 7 9 9 7 7 4 9 10 5 1 6 1 10 2 1 10 3 3 9
## [226] 4 9 5 7 4 10 4 7 2 8 10 5 5 2 9 6 9 1 3 7 9 9 1 7 8
## [251] 1 3 4 10 6 3 4 6 1 10 3 6 8 4 2 1 7 1 10 9 3 8 6 5 5
## [276] 9 8 7 2 3 7 8 8 10 6 10 4 6 5 1 6 6 8 7 8 3 6 4 4 10
## [301] 5 5 8 4 10 7 1 4 10 1 10 2 2 2 6 10 9 9 9 3 9 7 9 6 7
## [326] 7 1 1 9 6 8 10 5 1 7 8 3 3 2 10 9 1 3 2 3 2 10 6 7 4
## [351] 9 9 8 1 2 1 6 5 1 6 8 6 8 6 9 4 2 8 7 9 8 8 10 9 4
## [376] 3 1 5 6 10 7 4 10 3 4 7 4 6 3 4 6 10 7 5 7 1 1 3 1 6
## [401] 3 5 3 6 7 2 3 5 1 1 3 1 10 9 5 5 6 2 4 4 8 7 8 5 4
## [426] 3 9 9 9 2 4 9 5 8 9 9 7 10 10 3 10 8 7 2 9 3 9 7 6 3
## [451] 8 5 7 3 7 2 1 5 5 8 4 1 8 4 6 7 8 7 10 3 8 4 10 7 5
## [476] 7 1 2 8 4 2 4 8 3 4 3 10 3 9 8 8 2 2 7 3 10 8 3 7 9
## [501] 4 7 5 2 8 2 1 4 9 9 8 3 1 8 1 2 1 9 9 8 1 3 9 2 9
## [526] 5 4 5 9 1 10 1 5 6 1 10 1 10 3 2 4 7 4 6 4 4 3 3 4 9
## [551] 1 3 3 3 5 4 10 6 9 8 2 4 6 4 6 10 5 9 10 3 2 1 4 4 1
## [576] 5 4 6 9 1 6 2 1 7 8 9 7 5 10 5 5 1 10 2 5 10 7 10 5 1
## [601] 5 4 10 8 1 3 10 8 9 1 2 2 2 10 9 4 8 7 4 4 9 10 1 1 4
## [626] 1 10 3 8 1 2 2 7 2 2 6 4 2 8 8 10 9 5 10 3 7 9 8 6 7
## [651] 6 8 1 2 8 5 3 5 3 7 8 7 6 7 10 1 5 6 9 6 1 1 7 9 5
## [676] 5 10 9 2 9 10 6 2 10 6 2 5 6 7 7 3 10 3 8 5 7 5 7 7 5
## [701] 5 4 2 3 9 5 7 2 2 9 5 2 10 8 8 10 2 4 1 9 4 7 4 3 7
## [726] 9 3 10 3 4 2 3 8 6 7 8 8 10 8 9 5 6 6 10 9 7 5 3 5 6
## [751] 5 1 8 3 6 10 1 1 6 10 10 5 2 4 8 6 5 10
##[1]232111132133232
# Rows belonging to fold 1, used as the hold-out fold of the single split below.
testIndex <- which(index == 1)
# Logical masks are used for the subsetting instead of df[-testIndex, ]:
# negative indexing with an empty which() result would silently select zero
# rows rather than all of them.
# training data
datTraincv <- df[index != 1, ]
# test data
datTestcv <- df[index == 1, ]
# Complete the cross-validation: for every fold, fit the BloodPressure model on
# the other folds, predict the held-out fold, and record its RMSE.
cv_rmse <- vapply(
  sort(unique(index)),
  function(fold) {
    held_out <- index == fold
    fit <- lm(BloodPressure ~ ., data = df[!held_out, ])
    pred <- predict(fit, newdata = df[held_out, ])
    sqrt(mean((df$BloodPressure[held_out] - pred)^2))
  },
  numeric(1)
)
# Per-fold RMSE and its average across folds
cv_rmse
mean(cv_rmse)
#Test the correlation between variables
library(car)
## 载入需要的程辑包:carData
##
## 载入程辑包:'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(carData)
## Print the matrix of pairwise correlation coefficients between all columns.
## As a rule of thumb, two variables are considered highly correlated when the
## absolute value of their coefficient exceeds 0.8. With this call, NA values
## are not skipped, so the sample should not contain any.
cor(x = df, method = "pearson")
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.00000000 0.12945867 0.14128198 -0.08167177
## Glucose 0.12945867 1.00000000 0.15258959 0.05732789
## BloodPressure 0.14128198 0.15258959 1.00000000 0.20737054
## SkinThickness -0.08167177 0.05732789 0.20737054 1.00000000
## Insulin -0.07353461 0.33135711 0.08893338 0.43678257
## BMI 0.01768309 0.22107107 0.28180529 0.39257320
## DiabetesPedigreeFunction -0.03352267 0.13733730 0.04126495 0.18392757
## Age 0.54434123 0.26351432 0.23952795 -0.11397026
## Outcome 0.22189815 0.46658140 0.06506836 0.07475223
## Insulin BMI DiabetesPedigreeFunction
## Pregnancies -0.07353461 0.01768309 -0.03352267
## Glucose 0.33135711 0.22107107 0.13733730
## BloodPressure 0.08893338 0.28180529 0.04126495
## SkinThickness 0.43678257 0.39257320 0.18392757
## Insulin 1.00000000 0.19785906 0.18507093
## BMI 0.19785906 1.00000000 0.14064695
## DiabetesPedigreeFunction 0.18507093 0.14064695 1.00000000
## Age -0.04216295 0.03624187 0.03356131
## Outcome 0.13054795 0.29269466 0.17384407
## Age Outcome
## Pregnancies 0.54434123 0.22189815
## Glucose 0.26351432 0.46658140
## BloodPressure 0.23952795 0.06506836
## SkinThickness -0.11397026 0.07475223
## Insulin -0.04216295 0.13054795
## BMI 0.03624187 0.29269466
## DiabetesPedigreeFunction 0.03356131 0.17384407
## Age 1.00000000 0.23835598
## Outcome 0.23835598 1.00000000
# Matrix of pairwise scatter plots for every column of df
scatterplotMatrix(df, main = "Scatter Plot Matrix")
# Scatter plot of Age (vertical axis) against BloodPressure (horizontal axis)
x <- df[["BloodPressure"]]
y <- df[["Age"]]
plot(x, y)
# Simple linear regression of y (Age) on x (BloodPressure) with an intercept
lm.sol <- lm(formula = y ~ 1 + x)
# Residual quantiles, coefficient table and goodness-of-fit statistics
summary(lm.sol)
##
## Call:
## lm(formula = y ~ 1 + x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.155 -8.826 -3.535 6.718 48.816
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.18378 1.52951 15.158 < 2e-16 ***
## x 0.14553 0.02131 6.828 1.75e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.43 on 766 degrees of freedom
## Multiple R-squared: 0.05737, Adjusted R-squared: 0.05614
## F-statistic: 46.62 on 1 and 766 DF, p-value: 1.752e-11
# Fitted-line plot: the data points with the regression line drawn over them
plot(x, y, xlab = "BloodPressure", ylab = "Age")
abline(lm.sol)
# Value of x (BloodPressure) at which to predict y (Age).
# NOTE(review): summary(df) shows BloodPressure ranging from 0 to 122, so 0.24
# lies at the extreme low end of the data -- confirm this is the intended value.
preds <- data.frame(x = 0.24)
# Point prediction with a 95% prediction interval (lower and upper bounds)
predict(
  lm.sol,
  newdata = preds,
  interval = "prediction",
  level = 0.95
)
## fit lwr upr
## 1 23.2187 0.5912179 45.84619
# Residual analysis
## Stepwise model selection by AIC starting from lm.sol, allowing terms to be
## both added and dropped
lm.step <- step(object = lm.sol, direction = "both")
## Start: AIC=3743.44
## y ~ 1 + x
##
## Df Sum of Sq RSS AIC
## <none> 99992 3743.4
## - x 1 6086.1 106078 3786.8
## Standardized residuals of the selected model
y.rst <- rstandard(lm.step)
## Fitted values of the selected model
y.fit <- predict(lm.step)
## Scatter plot of standardized residuals against fitted values
plot(y.rst ~ y.fit)
# This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
# When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Example chunk from the R Markdown template: summarize the built-in cars data.
summary(datasets::cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
# You can also embed plots, for example:
# Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.