This is an example of how R can be used to manipulate map data. The leaflet package renders an interactive map, here with a popup marking where the final biostats project comes from:

library(leaflet)
m <- leaflet() %>%
  addTiles() %>%
  addPopups(-85.74571, 38.24655, 'Here is where the <b>final biostats project</b> comes from!')
m
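
As a quick extension (a sketch, not part of the original analysis), setView() centers the map and addCircleMarkers() highlights the same point:

leaflet() %>%
  addTiles() %>%
  setView(lng = -85.74571, lat = 38.24655, zoom = 12) %>%
  addCircleMarkers(lng = -85.74571, lat = 38.24655, radius = 8, color = "red",
                   popup = "Same location, marked with a circle")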


Total Discharges for Diabetes by Hospital (Kentucky)

library(ggvis)
library(dplyr)
library(htmltools)
library(htmlwidgets)
library(metricsgraphics)
library(RColorBrewer)
library(DT)
setwd('./..')
hosp <- read.csv('hospi.csv')
colnames(hosp)[1] <- "Disease"
## Total discharges for diabetes across Kentucky hospitals
hosp %>%
  filter(Disease == "638 - DIABETES W CC" & Provider.State == "KY") %>%
  arrange(desc(Total.Discharges)) %>%
  mjs_plot(x = Total.Discharges, y = Provider.Name, width = 800, height = 800) %>%
  mjs_bar() %>%
  mjs_axis_x(xax_format = 'plain')
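
The same filtered data can also be shown as a sortable table with the DT package loaded above; a minimal sketch:

hosp %>%
  filter(Disease == "638 - DIABETES W CC" & Provider.State == "KY") %>%
  select(Provider.Name, Provider.City, Total.Discharges) %>%
  arrange(desc(Total.Discharges)) %>%
  datatable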

Average Cost by Disease

avg <- hosp %>%
  group_by(Disease) %>%
  summarise(mean(Average.Medicare.Payments)) %>%
  data.frame
## Round off the average values
avg$mean.Average.Medicare.Payments. <- round(avg$mean.Average.Medicare.Payments., digits = 0)
## Arrange the values in descending order and plot a bar chart
avg %>%
  arrange(desc(mean.Average.Medicare.Payments.)) %>%
  mjs_plot(y = Disease, x = mean.Average.Medicare.Payments., height = 1000, width = 500) %>%
  mjs_bar() %>%
  mjs_axis_x(xax_format = 'plain')
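
The same rounded averages can also be printed as a sortable datatable; a small sketch:

avg %>%
  arrange(desc(mean.Average.Medicare.Payments.)) %>%
  datatable(colnames = c("Disease", "Mean Medicare Payment ($)"))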

## Trying support vector machines from the Stanford online class
set.seed(10111)
x <- matrix(rnorm(40), 20, 2)   # 20 observations in 2 dimensions
y <- rep(c(-1, 1), c(10, 10))   # two classes of 10
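
The chunk above only simulates the data. A sketch of how the Stanford lab continues, assuming the e1071 package (the lab's choice; the original may have intended a different implementation): the class-1 points are shifted apart and a linear support vector classifier is fit.

library(e1071)
x[y == 1, ] <- x[y == 1, ] + 1          # shift class 1 so the classes separate
plot(x, col = y + 3, pch = 19)          # quick look at the two classes
dat <- data.frame(x, y = as.factor(y))  # svm() needs the response as a factor
svmfit <- svm(y ~ ., data = dat, kernel = "linear", cost = 10, scale = FALSE)
plot(svmfit, dat)                       # decision boundary and support vectors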

Survival Analysis in R

library(survival)
setwd('./..')
lung <- read.csv('LungCancer.csv')
## Kaplan-Meier estimate of the overall survival curve
fit <- survfit(Surv(time, status, type = "right") ~ 1, data = lung)
plot(fit, conf.int = TRUE, mark.time = TRUE)

## Survival quantiles with confidence limits
quantile(fit, c(0.25, 0.5, 0.75))
## $quantile
##  25  50  75 
## 170 310 550 
## 
## $lower
##  25  50  75 
## 145 285 460 
## 
## $upper
##  25  50  75 
## 197 363 654
## The estimated median survival is 310 days (95% CI 285 to 363).
## Kaplan-Meier curves stratified by ECOG performance score
fit <- survfit(Surv(time, status, type = "right") ~ ph.ecog, data = lung)
plot(fit, conf.int = FALSE, mark.time = TRUE, lty = 1:3, col = 1:3)
legend("bottomleft", legend = c("0", "1", "2+"), title = "ph.ecog", col = 1:3, lty = 1:3, bty = "n")

## Compare survival functions across ECOG groups; rho = 0 gives the log-rank test
survdiff(Surv(time, status) ~ ph.ecog, rho = 0, data = lung)
## Call:
## survdiff(formula = Surv(time, status) ~ ph.ecog, data = lung, 
##     rho = 0)
## 
##              N Observed Expected (O-E)^2/E (O-E)^2/V
## ph.ecog=     1        1    0.101    7.9932    8.0533
## ph.ecog=0   63       37   54.402    5.5663    8.4079
## ph.ecog=1  113       82   83.995    0.0474    0.0972
## ph.ecog=2+  51       45   26.502   12.9114   15.5475
## 
##  Chisq= 26.9  on 3 degrees of freedom, p= 6.09e-06
## Cox proportional hazards model with the Karnofsky performance score as predictor
test <- coxph(Surv(time, status) ~ ph.karno, method = "efron", data = lung)
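
The fitted model itself is not printed above; summary() would show the coefficient and hazard ratio for ph.karno (output omitted here):

summary(test)   # coefficient, hazard ratio exp(coef), and significance tests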

## Test of the proportional hazards assumption for this model
cox.zph(test, transform = "km", global = TRUE)
##            rho chisq      p
## ph.karno 0.232  7.95 0.0048
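## A p-value of 0.0048 suggests that ph.karno violates the proportional hazards assumption.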

Classification in R

require(ISLR)
## Logistic regression of market direction on the five lag variables
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5, family = binomial, data = Smarket)
glm.fit %>%
  summary
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5, family = binomial, 
##     data = Smarket)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.376  -1.204   1.071   1.145   1.347  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.074163   0.056674   1.309    0.191
## Lag1        -0.071325   0.050104  -1.424    0.155
## Lag2        -0.044136   0.050025  -0.882    0.378
## Lag3         0.009229   0.049879   0.185    0.853
## Lag4         0.007211   0.049898   0.145    0.885
## Lag5         0.009311   0.049490   0.188    0.851
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1728.3  on 1244  degrees of freedom
## AIC: 1740.3
## 
## Number of Fisher Scoring iterations: 3
glm.probs <- predict(glm.fit, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
attach(Smarket)
table(glm.pred, Direction)
##         Direction
## glm.pred Down  Up
##     Down  116  98
##     Up    486 550
mean(glm.pred==Direction)
## [1] 0.5328
## Agreement with the observed direction is 53%, but that is on the training data.
## Make training and test sets: fit on years before 2005, evaluate on 2005
train <- Year < 2005
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, family = binomial, data = Smarket, subset = train)
glm.probs <- predict(glm.fit, newdata = Smarket[!train, ], type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
Direction.2005 <- Smarket$Direction[!train]
table(glm.pred, Direction.2005)
##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44
mean(glm.pred==Direction.2005)
## [1] 0.4801587
## Out-of-sample accuracy is only 48%, worse than guessing; fit a smaller model
glm.fit <- glm(Direction ~ Lag1 + Lag2, family = binomial, data = Smarket, subset = train)
glm.probs <- predict(glm.fit, newdata = Smarket[!train, ], type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
table(glm.pred, Direction.2005)
##         Direction.2005
## glm.pred Down  Up
##     Down   35  35
##     Up     76 106
mean(glm.pred==Direction.2005)
## [1] 0.5595238
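## The smaller model classifies about 56% of the 2005 test days correctly, the best of the three fits.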

Decision Trees

require(tree)    # install.packages("tree") first if it is not already installed
## Loading required package: tree
attach(Carseats) # Carseats comes with the ISLR package loaded earlier
hist(Sales)
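
The chunk above stops before fitting any tree. A sketch of how the ISLR lab usually continues, assuming the tree package is installed; High, carseats2, and tree.carseats are illustrative names, not from the original:

## Recode Sales into a binary response and grow a classification tree
High <- ifelse(Carseats$Sales <= 8, "No", "Yes")
carseats2 <- data.frame(Carseats, High = as.factor(High))
tree.carseats <- tree(High ~ . - Sales, data = carseats2)
summary(tree.carseats)          # variables used, terminal nodes, misclassification rate
plot(tree.carseats)
text(tree.carseats, pretty = 0) # label the splits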

Which are the busiest months at NYC airports? (Use the arrows to sort the table.)

library(nycflights13)
## Scatterplot matrix (left commented out)
## pairs(~ air_time + distance + hour + dep_delay + arr_delay, data = flights,
##       main = "Simple Scatterplot Matrix")
## Which months are the busiest to fly in?
month <- table(flights$month) %>% data.frame
colnames(month) <- c("Month", "Count")
datatable(month)

Which days of the month are the busiest for flights in NYC?

day <- table(flights$day) %>% data.frame
colnames(day) <- c("Day of the Month", "Count")
datatable(day)

Which are the most common arrival times at destinations?

## At what time do people like to reach their destination?
arrt <- table(flights$arr_time) %>% data.frame
colnames(arrt) <- c("Arrival Time", "Count")
datatable(arrt)

Calculating a Chi-Square Test by Hand (Expected Values)

a <- c(8, 2)
b <- c(2, 8)
dat <- rbind(a, b)
dat <- as.table(dat)
## Calculate the expected counts from the row and column margins
expected <- as.array(margin.table(dat, 1)) %*% t(as.array(margin.table(dat, 2))) / margin.table(dat)
expected
##   A B
## a 5 5
## b 5 5
## Chi-square statistic: sum of (observed - expected)^2 / expected
chi <- sum((expected - as.array(dat))^2 / expected)
chi
## [1] 7.2
## p-value of the statistic; a 2x2 table has (2-1)*(2-1) = 1 degree of freedom
1 - pchisq(chi, df = 1)
## [1] 0.0072908
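
As a cross-check (a sketch, not in the original), the built-in chisq.test() reproduces the same statistic when Yates' continuity correction is turned off:

chisq.test(dat, correct = FALSE)   # X-squared = 7.2, df = 1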

Principal Components in R

We will use the USArrests dataset, which is built into R.

dimnames(USArrests)
## [[1]]
##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"       
## 
## [[2]]
## [1] "Murder"   "Assault"  "UrbanPop" "Rape"
## Apply the mean to each column
USArrests %>% summarise_each(funs(mean))
##   Murder Assault UrbanPop   Rape
## 1  7.788  170.76    65.54 21.232
##variance of each column
USArrests %>% summarise_each(funs(var))
##     Murder  Assault UrbanPop     Rape
## 1 18.97047 6945.166 209.5188 87.72916

We see that Assault has a much larger variance than the other variables. It would dominate the principal components, so we choose to standardize the variables when we perform PCA.

pca.out <- prcomp(USArrests, scale = TRUE)
pca.out
## Standard deviations:
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
## 
## Rotation:
##                 PC1        PC2        PC3         PC4
## Murder   -0.5358995  0.4181809 -0.3412327  0.64922780
## Assault  -0.5831836  0.1879856 -0.2681484 -0.74340748
## UrbanPop -0.2781909 -0.8728062 -0.3780158  0.13387773
## Rape     -0.5434321 -0.1673186  0.8177779  0.08902432
names(pca.out)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
biplot(pca.out, scale = 0.6)
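
From the standard deviations printed above, the proportion of variance explained by each component follows directly; a short sketch:

pve <- pca.out$sdev^2 / sum(pca.out$sdev^2)  # proportion of variance explained
round(pve, 3)                                # roughly 0.620, 0.247, 0.089, 0.043
plot(pve, type = "b", xlab = "Principal Component", ylab = "Proportion of Variance Explained")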

Hospital Charge Data

## Function to list hospital charges for a given city and state from the hosp data frame
getHospitalCharges <- function(city, state){
  city <- toupper(city)
  state <- toupper(state)
  hosp %>%
    filter(Provider.City == city & Provider.State == state) %>%
    select(Disease, Provider.Name, Average.Total.Payments) %>%
    arrange(desc(Average.Total.Payments)) %>%
    datatable
}
getHospitalCharges('dayton','oh')

## Mean number of discharges per disease, across all hospitals
hosp %>%
  select(Disease, Total.Discharges) %>%
  group_by(Disease) %>%
  summarise(mean(Total.Discharges)) %>%
  data.frame %>%
  arrange(desc(mean.Total.Discharges.)) %>%
  datatable