library(leaflet)
m = leaflet() %>% addTiles()%>%
addPopups(-85.74571, 38.24655, 'Here is the <b>Where The Final Biostats Project Comes From !!!</b>')
m
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(ggvis)
library(dplyr)
library(htmltools)
library(htmlwidgets)
library(metricsgraphics)
library(RColorBrewer)
library(DT)
setwd('./..')
hosp<-read.csv('hospi.csv')
colnames(hosp)[1] = "Disease"
## total discharges for diabetes for kentucky hospitals
hosp %>% filter(Disease=="638 - DIABETES W CC" & Provider.State=="KY") %>%
arrange(desc(Total.Discharges)) %>%
mjs_plot(x=Total.Discharges, y=Provider.Name, width=800, height=800) %>%
mjs_bar() %>%
mjs_axis_x(xax_format = 'plain')
avg<-hosp %>% group_by(Disease) %>%
summarise(mean(Average.Medicare.Payments)) %>%
data.frame
##rounding off the average values
avg$mean.Average.Medicare.Payments.<-round(avg$mean.Average.Medicare.Payments.,digits=0)
##arranging the values in descending order and printing a datatable
avg %>%
arrange(desc(mean.Average.Medicare.Payments.)) %>%
mjs_plot(y=Disease,x=mean.Average.Medicare.Payments.,height=1000,width=500) %>%
mjs_bar() %>%
mjs_axis_x(xax_format = 'plain')
##Trying support vector machines from Stanford online class
set.seed(10111)
x=matrix(rnorm(40),20,2)
y=rep(c(-1,1),c(10,10))
library(survival)
setwd('./..')
lung<-read.csv('LungCancer.csv')
fit <- survfit(Surv(time, status,type="right") ~ 1, data = lung)
plot(fit,conf.int=T,mark.time=T)
quantile(fit,c(0.25,0.5,0.75))
## $quantile
## 25 50 75
## 170 310 550
##
## $lower
## 25 50 75
## 145 285 460
##
## $upper
## 25 50 75
## 197 363 654
##kaplan meier by ecog
fit <- survfit(Surv(time, status,type="right") ~ ph.ecog, data=lung)
plot(fit, conf.int=FALSE, mark.time=TRUE,lty=1:3,col=1:3)
legend("bottomleft",legend=c("0","1","2+"),title="ph.ecog",col=1:3,lty=1:3,bty="n")
##comparing survival functions
survdiff(Surv(time,status) ~ ph.ecog,rho=0,data=lung)
## Call:
## survdiff(formula = Surv(time, status) ~ ph.ecog, data = lung,
## rho = 0)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## ph.ecog= 1 1 0.101 7.9932 8.0533
## ph.ecog=0 63 37 54.402 5.5663 8.4079
## ph.ecog=1 113 82 83.995 0.0474 0.0972
## ph.ecog=2+ 51 45 26.502 12.9114 15.5475
##
## Chisq= 26.9 on 3 degrees of freedom, p= 6.09e-06
##cox proportional hazards
test<-coxph(Surv(time,status) ~ ph.karno, method="efron",data=lung)
##test of the porportional hazards assumption for this model
cox.zph(test, transform="km", global=TRUE)
## rho chisq p
## ph.karno 0.232 7.95 0.0048
require(ISLR)
library(utils)
##Logistic Regression
glm.fit<-glm(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5,binomial,Smarket)
glm.fit %>%
summary
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5, family = binomial,
## data = Smarket)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.376 -1.204 1.071 1.145 1.347
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.074163 0.056674 1.309 0.191
## Lag1 -0.071325 0.050104 -1.424 0.155
## Lag2 -0.044136 0.050025 -0.882 0.378
## Lag3 0.009229 0.049879 0.185 0.853
## Lag4 0.007211 0.049898 0.145 0.885
## Lag5 0.009311 0.049490 0.188 0.851
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1728.3 on 1244 degrees of freedom
## AIC: 1740.3
##
## Number of Fisher Scoring iterations: 3
glm.probs<-predict(glm.fit,type="response")
glm.pred=ifelse(glm.probs>0.5,"Up","Down")
attach(Smarket)
table(glm.pred,Direction)
## Direction
## glm.pred Down Up
## Down 116 98
## Up 486 550
mean(glm.pred==Direction)
## [1] 0.5328
##Make Training and Test sets
train=Year<2005
glm.fit<-glm(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume,family=binomial,data=Smarket,subset=train)
glm.probs<-predict(glm.fit,
newdata=Smarket[!train,],type="response")
glm.pred=ifelse(glm.probs>0.5,"Up","Down")
Direction.2005<-Smarket$Direction[!train]
table(glm.pred,Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 77 97
## Up 34 44
mean(glm.pred==Direction.2005)
## [1] 0.4801587
##Fit smaller model
glm.fit<-glm(Direction ~ Lag1+Lag2,family=binomial,data=Smarket,subset=train)
glm.probs<-predict(glm.fit,
newdata=Smarket[!train,],type="response")
glm.pred=ifelse(glm.probs>0.5,"Up","Down")
Direction.2005<-Smarket$Direction[!train]
table(glm.pred,Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 35 35
## Up 76 106
mean(glm.pred==Direction.2005)
## [1] 0.5595238
Decision Trees
require(tree)
## Loading required package: tree
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'tree'
attach(Carseats)
hist(Sales)
library(nycflights13)
##Scatterplot matrix
##pairs(~air_time+distance+hour+dep_delay+arr_delay,data=flights,
## main="Simple Scatterplot Matrix")
##which months are the busiest to fly in
month<-table(flights$month) %>% data.frame
colnames(month)<-c("Months","Count")
datatable(month)
day<-table(flights$day) %>% data.frame
colnames(day)<-c("Days Of The Month","Count")
datatable(day)
## at what time do people like to reach destination
arrt<-table(flights$arr_time) %>% data.frame
colnames(arrt)<-c("Arrvial Times","Count")
datatable(arrt)
a<-c(8,2)
b<-c(2,8)
dat<-rbind(a,b)
dat<-as.table(dat)
##calculating the expected
expected <- as.array(margin.table(dat,1)) %*% t(as.array(margin.table(dat,2))) / margin.table(dat)
expected
## A B
## a 5 5
## b 5 5
##calculating chisquare(the total of observed minus expected)
chi <- sum((expected - as.array(dat))^2/expected)
chi
## [1] 7.2
##getting pvalue of this statistic
1-pchisq(chi,df=4)
## [1] 0.1256891
We will use ‘USArrests’ data which is in R.
dimnames(USArrests)
## [[1]]
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
##
## [[2]]
## [1] "Murder" "Assault" "UrbanPop" "Rape"
##summary function to each column
USArrests %>% summarise_each(funs(mean))
## Murder Assault UrbanPop Rape
## 1 7.788 170.76 65.54 21.232
##variance of each column
USArrests %>% summarise_each(funs(var))
## Murder Assault UrbanPop Rape
## 1 18.97047 6945.166 209.5188 87.72916
We see that Assault has a much larger variance than the other variables. It would dominate the principal components, so we choose to standardize the components when we perform PCA.
pca.out=prcomp(USArrests,scale=TRUE)
pca.out
## Standard deviations:
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
##
## Rotation:
## PC1 PC2 PC3 PC4
## Murder -0.5358995 0.4181809 -0.3412327 0.64922780
## Assault -0.5831836 0.1879856 -0.2681484 -0.74340748
## UrbanPop -0.2781909 -0.8728062 -0.3780158 0.13387773
## Rape -0.5434321 -0.1673186 0.8177779 0.08902432
names(pca.out)
## [1] "sdev" "rotation" "center" "scale" "x"
biplot(pca.out,scale=.6)
##trying to make a function to get hospitals by city from hosp data frame
getHospitalCharges <-function(city,state){
city<-toupper(city);
state<-toupper(state)
hosp %>%
filter(Provider.City==city & Provider.State==state) %>%
select(Disease,Provider.Name,Average.Total.Payments) %>%
arrange(desc(Average.Total.Payments)) %>%
datatable %>%
return
}
getHospitalCharges('dayton','oh')
hosp %>%
select(Disease,Total.Discharges) %>% group_by(Disease) %>% summarise(mean(Total.Discharges)) %>% data.frame %>% arrange(desc(mean.Total.Discharges.)) %>%
datatable