Overview

This document explains how to create a simple linear regression using R. The data used for the purpose is clinical trials data obtained from ACCT website. ACCT provides the data in well table structured in text format. The document explains steps for building a linear regression model using training data, testing the model on a test data and evaluating the model using R programming language.
For more detailed explaination and interpretations of results, and also to read other articles on Clinical Trials visit my blog here at https://businessintelligencedw.blogspot.com/.

Linear regression model

The two variables that we are interested in the datasets are the number of studies completed by a study sponsor and number of studies with results posted by a sponsor. We want to create a model that predicts the number of results a sponsor would post (y or dependent variable) for a given number of completed studies (x or independent variable).

sponsor_f_lm<-subset.data.frame(sponsor_f, subset=cnt_completed_status>=1, select=c("name","agency_class","sponsor_type","flag_sponsor_industry","flag_sponsor_type_academic", "flag_sponsor_type_hospital","cnt_completed_status","cnt_results_submitted"))

Let’s take a look at the summary stats of x and y variables.

nrow(sponsor_f)
## [1] 28069
nrow(sponsor_f_lm)
## [1] 16890
count(sponsor_f_lm, "agency_class")
## # A tibble: 16,890 x 7
## # Groups:   name, agency_class, sponsor_type, flag_sponsor_industry,
## #   flag_sponsor_type_academic [16,890]
##    name  agency_class sponsor_type flag_sponsor_in… flag_sponsor_ty…
##    <fct> <fct>        <fct>        <fct>            <fct>           
##  1 153r… Other        Hospital     0                0               
##  2 1st … Other        Hospital     0                0               
##  3 21st… Other        NA           0                0               
##  4 22EON Industry     NA           1                0               
##  5 23an… Industry     NA           1                0               
##  6 251 … Other        Hospital     0                0               
##  7 2C T… Industry     NA           1                0               
##  8 3-C … Industry     Academic     1                1               
##  9 307 … Other        Hospital     0                0               
## 10 3E T… Industry     NA           1                0               
## # … with 16,880 more rows, and 2 more variables: `"agency_class"` <chr>,
## #   n <int>
count(sponsor_f_lm, "cnt_completed_status")
## # A tibble: 16,890 x 7
## # Groups:   name, agency_class, sponsor_type, flag_sponsor_industry,
## #   flag_sponsor_type_academic [16,890]
##    name  agency_class sponsor_type flag_sponsor_in… flag_sponsor_ty…
##    <fct> <fct>        <fct>        <fct>            <fct>           
##  1 153r… Other        Hospital     0                0               
##  2 1st … Other        Hospital     0                0               
##  3 21st… Other        NA           0                0               
##  4 22EON Industry     NA           1                0               
##  5 23an… Industry     NA           1                0               
##  6 251 … Other        Hospital     0                0               
##  7 2C T… Industry     NA           1                0               
##  8 3-C … Industry     Academic     1                1               
##  9 307 … Other        Hospital     0                0               
## 10 3E T… Industry     NA           1                0               
## # … with 16,880 more rows, and 2 more variables:
## #   `"cnt_completed_status"` <chr>, n <int>
summary(sponsor_f_lm$cnt_completed_status)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    1.000    1.000    1.000    9.918    4.000 2830.000
summary(sponsor_f_lm$cnt_results_submitted)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0000   0.0000   0.0000   0.7848   0.0000 575.0000

Histogram: Let’s take a look at the distribution.

#Create histogram
plot_hist_1<-ggplot(data = sponsor_f_lm,aes(x=cnt_completed_status))+
  geom_histogram(binwidth = 5)+
  ggtitle("Histogram: Completed Studies")+
  xlab("Completed")+
  ylab("Sponsors Count")+
  xlim(0,200)+
  ylim(0,600)+
  theme_bw()
ggplot_hist_1<-ggplotly(plot_hist_1)
## Warning: Removed 135 rows containing non-finite values (stat_bin).
ggplot_hist_1

Let’s take a look if there is any relationship between our X and Y variables.

Scatterplot 1:

#scatterplot
plot_scatter_1<-ggplot(data=sponsor_f_lm, aes(x=cnt_completed_status, y=cnt_results_submitted))+
  geom_point()+
  geom_smooth(method = "lm")+
  ggtitle("Completed Vs Results Posted")+
  xlab("Completed Studies")+
  ylab("Results Posted")+
  theme_bw()
ggplot_scatter_1<-ggplotly(plot_scatter_1)
ggplot_scatter_1

Let’s divide our data into 2 sets. We will randomly assign 70% of the data as training data and remaining 30% to test data set.

#create linear regression using training and test data

#generate training and test data
set.seed(123)
sampling_index<- sample(x=2, nrow(sponsor_f_lm), replace = TRUE, prob = c(0.7,0.3))

train_set<-sponsor_f_lm[sampling_index==1,]
test_set<-sponsor_f_lm[sampling_index==2,]

#create regression model
lin_model_1<-lm(formula=cnt_results_submitted ~ cnt_completed_status,data=train_set)
summary(lin_model_1)
## 
## Call:
## lm(formula = cnt_results_submitted ~ cnt_completed_status, data = train_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -117.779    0.048    0.291    0.291  231.437 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.4122187  0.0452308  -9.114   <2e-16 ***
## cnt_completed_status  0.1215459  0.0007787 156.094   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.857 on 11862 degrees of freedom
## Multiple R-squared:  0.6726, Adjusted R-squared:  0.6725 
## F-statistic: 2.437e+04 on 1 and 11862 DF,  p-value: < 2.2e-16

There are 16890 records in the data set. Train data set got 11864 records where as test set got 5026.

Evaluate the regression results on train set:

#view the regression results
#head(tidy(results))#provides limited columns of regression summary
#head(augment(results))#provides all columns like residual and fitted
lmresults_tidy<-augment(lin_model_1)

#plot residuals
plot_eval1<-ggplot(data=lmresults_tidy, aes(x=.fitted, y=.resid))+
  geom_point()+
  geom_hline(yintercept=0)+
  ggtitle("Residual Plot - Training Set")+
  theme_bw()
ggplot_eval1<-ggplotly(plot_eval1)
ggplot_eval1

Execute the model on test data set to make predictions:

#store predicted results executing model on test set
lm_pred<-predict(lin_model_1, test_set)

#calculate RMSE of predicted residuals RMSE(actual, predicted)
test_RMSE<-RMSE(test_set$cnt_results_submitted,lm_pred)

The RMSE for predictions on test set is: 4.840729

Plot Actual Vs Predicted for test set:

#add the predicted column to test data set
lmpred_test<-data.frame(lm_pred,test_set)

#plot actual vs predicted
plot_lmpredict<-ggplot(data=lmpred_test, aes(x=cnt_results_submitted, y=lm_pred))+
  geom_point()+
  geom_abline(intercept=0, slope=1)+
  ggtitle("Actual Vs Predicted - Test Set")+
  theme_bw()
ggplot_lmpredict<-ggplotly(plot_lmpredict)
ggplot_lmpredict

Plot residuals for predicted values on test data set:

#create a new col for residual=(act-predicted)
lmpred_test$residual<-lmpred_test$cnt_results_submitted-lmpred_test$lm_pred

#plot residuals
plot_lmpred_residual<-ggplot(data=lmpred_test, aes(x=lm_pred, y=residual))+
  geom_point()+
  geom_hline(yintercept=0)+
  ggtitle("Residual Plot - Test Set")+
  theme_bw()
ggplot_lmpred_residual<-ggplotly(plot_lmpred_residual)
ggplot_lmpred_residual

End of Document