R has two native serialization formats: RData (sometimes shortened to Rda) and Rds. Both are used to save R objects to disk for later use. An RData file can store multiple named R objects, while an Rds file stores a single, unnamed R object.
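As a minimal sketch of the difference (the objects and file names below are placeholders, not part of the lab data):

a <- 1:5
b <- letters[1:3]
save(a, b, file = "objects.RData") # RData: several named objects in one file
load("objects.RData")              # restores a and b under their original names
saveRDS(a, file = "a.rds")         # Rds: exactly one object, no name stored
a2 <- readRDS("a.rds")             # the reader chooses the name for the object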

dt <- readRDS("/Users/jianingjin/Desktop/IEMS_304/lab4/Trp63.tf.rds") # one object per file, so readRDS()
head(dt)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6
library(Matrix)
dim(dt)
## [1] 2177  226
#Part 1 LASSO Regression
#(1)
x <- model.matrix(Trp63~., data = dt)[, -1] # predictor matrix, dropping the intercept column
y <- dt$Trp63 # response: Trp63 expression
# grid of candidate lambda values (not passed to cv.glmnet below, which uses its default lambda sequence)
grid <- 10^seq(2, -2, length = 100)
set.seed(1)
train <- sample(1:nrow(x), nrow(x)/2) # indices of the training observations
test <- (-train) # negative indices select the test observations
y.test <- y[test] # test-set response
set.seed(1)
cv.out <- cv.glmnet(x[train,], y[train], alpha = 1) # 10-fold cross-validation for the lasso (alpha = 1)
plot(cv.out)

bestlambda <- cv.out$lambda.min # lambda with the lowest cross-validated error
bestlambda
## [1] 0.01543888

The which() function returns the indices of the elements of a logical vector that are TRUE, i.e., the positions of the values satisfying a given condition. The returned positions can index rows, columns, or elements of a vector.
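A small illustration with a made-up vector (not part of the lab data):

v <- c(3, -1, 7, 0, 5)
which(v > 2)       # positions of the elements greater than 2: returns 1 3 5
which(v == min(v)) # position of the smallest element: returns 2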

#(2)
lasso.mod <- glmnet(x[train,], y[train], alpha = 1, lambda = bestlambda)
lasso.mod$beta[which(lasso.mod$beta != 0),] # coefficients kept (nonzero) by the lasso
##           Msc        Tfap2b          Elk4          Elf3         Prrx1 
## -0.0909504492  0.3699335151  0.0018946777  0.6372448826 -0.0487509892 
##        Pou2f1          Rxra         Lmx1b          Lhx6         Tgif2 
##  0.0428647152  0.0129416437 -0.0214637574  0.0547086012  0.0589693570 
##         Snai1        Tfap2c          Zic3          Arnt          Lef1 
## -0.0012158317  0.1206164588 -0.0013523241 -0.0004506849  0.0577618544 
##         Creb3          Nfib           Jun        Pou3f1         Trp73 
## -0.0035293109  0.0522551196 -0.0001425593  0.0445560735  0.2579008019 
##          Dlx5        Zfp384          Relb         Tead2         Prdm4 
## -0.0342430624  0.0332265685  0.0060412693 -0.0259555088  0.0051240374 
##          Jund          Elf1          Klf5         Sox21          Ebf1 
## -0.0264334912  0.0339263698  0.1186651846  0.1898508806 -0.0146083753 
##         Hoxb7         Hoxb6          Sox9         Rreb1        Tfap2a 
##  0.0384285421  0.0007389748  0.0636615812  0.0294644718  0.2482972521 
##          Osr1         Grhl1           Fos          Batf         Grhl2 
## -0.0102513093  0.0603104659 -0.0071014984 -0.0407117144  0.0246994250 
##          Atf4         Nr4a1         Hoxc9         Tfap4         Bach1 
## -0.0059761315 -0.0072815310 -0.0246557693  0.0379373153  0.0031395271 
##          Rxrb         Runx2          Zeb1        Pou4f3         Fosl1 
## -0.0094055227 -0.0013470158 -0.0085562940 -0.3690101145 -0.0096505656 
##          Klf9         Foxp1 
## -0.0377028620  0.0126803302
#(3)
lasso.pred <- predict(lasso.mod, s = bestlambda, newx = x[test, ])
mean((lasso.pred - y.test)^2) # test MSE of the lasso fit
## [1] 0.1214215
#Part 2: Decision Trees
#Q1
library(tree)
dt2 = data.frame(dt) # convert the dataset to a data.frame for tree()
model.tree = tree(Trp63 ~ ., data = dt2, subset = train)
summary(model.tree)
## 
## Regression tree:
## tree(formula = Trp63 ~ ., data = dt2, subset = train)
## Variables actually used in tree construction:
## [1] "Tfap2b" "Tfap2a" "Klf5"   "Prrx1"  "Max"    "Nfib"   "Klf4"   "Tead2" 
## [9] "Hsf1"  
## Number of terminal nodes:  11 
## Residual mean deviance:  0.09911 = 106.7 / 1077 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.31900 -0.04744 -0.04744  0.00000 -0.04744  2.03200
#Q2
plot(model.tree)
text(model.tree, pretty = 0)

The par() function sets or queries graphical parameters. With it we can split the plotting frame into a grid of panels, adjust the margins, or change the background colour of the frame, which makes it a convenient way to draw several plots on the same page.
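A brief illustrative sketch of the settings mentioned above (not part of the lab code):

op <- par(mfrow = c(1, 2),     # split the frame into a 1-row, 2-column grid of panels
          mar = c(4, 4, 2, 1), # margins in lines: bottom, left, top, right
          bg = "white")        # background colour of the plotting frame
plot(1:10)
plot(10:1)
par(op)                        # restore the previous graphical parameters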

#Q3
set.seed(1)
cv_out = cv.tree(model.tree)
par(mfrow = c(1, 2)) # lay out both cross-validation plots on the same page
plot(cv_out$size, cv_out$dev, type = "b") # CV deviance vs. tree size
plot(cv_out$k, cv_out$dev, type = "b")    # CV deviance vs. cost-complexity parameter k

which(cv_out$dev == min(cv_out$dev))
## [1] 8
min(cv_out$dev)
## [1] 154.6983
fit_prune <- prune.tree(model.tree, which(cv_out$dev == min(cv_out$dev))) # prune model.tree; the unnamed second argument is passed to k, the cost-complexity parameter
summary(fit_prune)
## 
## Regression tree:
## snip.tree(tree = model.tree, nodes = c(4L, 6L, 5L))
## Variables actually used in tree construction:
## [1] "Tfap2b" "Tfap2a" "Prrx1" 
## Number of terminal nodes:  4 
## Residual mean deviance:  0.1294 = 140.2 / 1084 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.03800 -0.05904 -0.05904  0.00000 -0.05904  2.02000
plot(fit_prune)
text(fit_prune, pretty = 0)

pred_prune = predict(fit_prune, dt[-train, ]) # predictions from the pruned tree on the held-out rows
test = dt[-train, "Trp63"] # observed test-set response (this overwrites the earlier test index vector)
plot(pred_prune, test)
abline(0, 1)

mean((pred_prune - test)^2)
## [1] 0.1416439
#Q4
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
fit_bag = randomForest(Trp63~., data = dt, subset = train, mtry = 225, importance = T) # mtry = 225 uses all 225 predictors at each split, i.e. bagging
pred_bag = predict(fit_bag, dt[-train,])
plot(pred_bag, test)
abline(0, 1)

mean((pred_bag - test)^2)
## [1] 0.1272901
#Q5
set.seed(1)
fit_rf = randomForest(
  Trp63~., data = dt, subset = train, mtry = 76, importance = T
) # mtry = 76 is roughly p/3 of the 225 predictors, the usual choice for regression forests
pred_rf = predict(fit_rf, dt[-train, ])
mean((pred_rf - test)^2)
## [1] 0.1116688
#Q6
head(importance(fit_rf))
##           %IncMSE IncNodePurity
## Msc     1.0010015   0.004599266
## Tfap2b 44.2906374  72.363696189
## Pou3f3  0.4013403   0.063827035
## Stat1  -1.0432472   0.250776582
## Creb1  -0.8680557   0.664063886
## Elk4    9.9060221   3.465068850

Conclusion Q1: In reality, we would not focus on only one gene (e.g., Trp63) in our research. If we wanted to extend one of these methods to build a network covering all other genes of interest, which method would you choose, based on the results above? Why? A: The random forest, because it achieved the smallest test MSE (0.112, versus 0.121 for the lasso, 0.142 for the pruned tree, and 0.127 for bagging).

Q2: After trying all these regression methods, do you observe any consistency in the results? What would you propose as the most significant regulator gene for our gene of interest, Trp63? A: Tfap2b. In the lasso fit, Tfap2b has one of the largest coefficients in absolute value (0.37). In the tree models, Tfap2b appears at the first split of every fitted tree. In the random forest, Tfap2b has the largest variable importance (%IncMSE of about 44.3).