Loading dataset from Desktop

# Interactively pick the tab-delimited data file; its first row supplies the
# column names. `TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
LungCapData <- read.delim(file.choose(), header = TRUE)
# attach() places the columns on the search path; the code that follows refers
# to them by bare name (Height, LungCap, Age, Smoke).
attach(LungCapData)
# List the column names that were read.
names(LungCapData)
## [1] "LungCap"   "Age"       "Height"    "Smoke"     "Gender"    "Caesarean"





Convert the numeric variable (Height) to a factor (categorical variable)

Here height will be broken down into 6 parts

<50 = A

50~55 = B

55~60 = C

60~65 = D

65~70 = E

>=70 = F

# Bin Height into six left-closed intervals: [0,50) -> A, [50,55) -> B,
# [55,60) -> C, [60,65) -> D, [65,70) -> E, [70,100) -> F.
Factor_height <- cut(
  x = Height,
  breaks = c(0, seq(from = 50, to = 70, by = 5), 100),
  labels = LETTERS[1:6],
  right = FALSE
)





Check out the first 10 values of Height

# Show the first ten raw heights, then the group assigned to each of them.
Height[seq_len(10)]
##  [1] 62.1 74.7 69.7 71.0 56.9 58.7 63.3 70.4 70.5 59.2
Factor_height[seq_len(10)]
##  [1] D F E F C C D F F C
## Levels: A B C D E F





To break the variable into n equal-width parts instead

Here, n = 4

# Passing a single number as `breaks` asks cut() for that many equal-width
# intervals over the range of Height. The labels are applied exactly as
# written (note the lowercase "c").
# NOTE: this reassigns Factor_height, replacing the six-group factor built
# earlier.
Factor_height <- cut(
  x = Height,
  breaks = 4,
  labels = c("A", "B", "c", "D"),
  right = FALSE
)





Result

# Show the first ten raw heights, then the group assigned to each of them.
Height[seq_len(10)]
##  [1] 62.1 74.7 69.7 71.0 56.9 58.7 63.3 70.4 70.5 59.2
Factor_height[seq_len(10)]
##  [1] B D c c B B B c c B
## Levels: A B c D





Compute the mean, max, and min of LungCap within the factor groups


Mean of LungCap of group A

# Average lung capacity over the observations whose height group is "A".
with(LungCapData, mean(LungCap[Factor_height == "A"]))
## [1] 3.082371


Max of LungCap of group B

# Largest lung capacity among the observations whose height group is "B".
with(LungCapData, max(LungCap[Factor_height == "B"]))
## [1] 9.925





Count how many observations are smokers and how many are non-smokers

# Frequency count of each smoking-status value.
with(LungCapData, table(Smoke))
## Smoke
##  no yes 
## 648  77





Regression Model (LungCap vs Age and Smoke)

# Fit LungCap as a linear function of Age and smoking status, then print the
# coefficient table and fit statistics.
m <- lm(formula = LungCap ~ Age + Smoke)
summary(m)
## 
## Call:
## lm(formula = LungCap ~ Age + Smoke)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8559 -1.0289 -0.0363  1.0083  4.1995 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.08572    0.18299   5.933 4.61e-09 ***
## Age          0.55540    0.01438  38.628  < 2e-16 ***
## Smokeyes    -0.64859    0.18676  -3.473 0.000546 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.514 on 722 degrees of freedom
## Multiple R-squared:  0.6773, Adjusted R-squared:  0.6764 
## F-statistic: 757.5 on 2 and 722 DF,  p-value: < 2.2e-16





Plotting the Regression Model (LungCap vs Age, Smoking)

plot - Plotting the data for Non-Smoker

points - Plotting the data for Smoker

legend - adding legends

1st abline - Regression line for non-smokers (here the smoking indicator = 0)

2nd abline - Regression line for smokers (here the smoking indicator = 1)

# Scatter of LungCap against Age split by smoking status, overlaid with the
# two parallel lines implied by the additive model `m`
# (LungCap ~ Age + Smoke).
plot(Age[Smoke == "no"], LungCap[Smoke == "no"],
     col = "yellow", xlim = c(0, 20), ylim = c(0, 15),
     xlab = "Age", ylab = "LungCap", main = "LungCap vs Age,Smoking")
points(Age[Smoke == "yes"], LungCap[Smoke == "yes"], col = "red", pch = 16)
legend(1, 15, legend = c("Non-Smoker", "Smoker"),
       col = c("yellow", "red"), pch = c(1, 16), bty = "n")

# Take the line parameters from the fitted model rather than retyping rounded
# numbers: the smoker intercept is (Intercept) + Smokeyes, about 0.437.
fit_coef <- coef(m)
abline(a = fit_coef[["(Intercept)"]], b = fit_coef[["Age"]],
       col = "yellow", lwd = 3)
abline(a = fit_coef[["(Intercept)"]] + fit_coef[["Smokeyes"]],
       b = fit_coef[["Age"]], col = "red", lwd = 3)





Plotting the Regression Model (LungCap vs Age, Factor_height)

# Scatter of LungCap against Age coloured by six height groups, with one line
# per group (shared slope, group-specific intercept).
#
# The six-level grouping is rebuilt here under its own name because
# Factor_height was last reassigned to a four-level factor labelled A/B/c/D,
# which would leave the "C", "E" and "F" subsets empty.
height_group <- cut(Height, breaks = c(0, 50, 55, 60, 65, 70, 100),
                    labels = c("A", "B", "C", "D", "E", "F"), right = FALSE)
group_names <- levels(height_group)
group_cols <- 2:7

# Hand-entered line parameters, one intercept per group in level order.
# NOTE(review): presumably taken from a fit of LungCap on Age and the height
# group that is not shown here - confirm.
group_intercepts <- c(0.98, 2.46, 3.67, 4.92, 5.99, 7.52)
common_slope <- 0.2

# The first group sets up the axes; the remaining groups are added on top.
in_first <- height_group == group_names[1]
plot(Age[in_first], LungCap[in_first],
     col = group_cols[1], xlim = c(0, 20), ylim = c(0, 15),
     xlab = "Age", ylab = "LungCap", main = "LungCap vs Age,Factor_height")
for (i in seq_along(group_names)[-1]) {
  in_group <- height_group == group_names[i]
  points(Age[in_group], LungCap[in_group], col = group_cols[i])
}

legend(1, 15, legend = group_names, col = group_cols, pch = 1, cex = 0.8)

for (i in seq_along(group_names)) {
  abline(a = group_intercepts[i], b = common_slope,
         col = group_cols[i], lwd = 3)
}