Heat Map aggregating 5 key hospital income statement items by California counties in 2009
The data comes from data.gov.
library(RCurl)
## Loading required package: bitops
require(qcc)
## Loading required package: qcc
## Package 'qcc', version 2.6
## Type 'citation("qcc")' for citing this R package in publications.
x <- getURL("https://raw.githubusercontent.com/paolomarco/data/master/Hospital_Profitability__2009-2013%207.04.03%20PM.csv")
profits<-read.csv(text=x,header = TRUE,sep = ',')
profits_2013<-subset(profits,Year==2013)
profits_2013<-subset(profits_2013,select = c('Facility.Number', 'County.Name','Income.Statement.Item','Income.Statement.Amount'))
require(reshape2)
## Loading required package: reshape2
profits_2013_wide<-dcast(profits_2013,Facility.Number+County.Name~Income.Statement.Item,value.var = 'Income.Statement.Amount')
#split the dataframe by county
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
counties <- group_by(profits_2013_wide,County.Name)
#compute mean sumary statistics for every state
counties_avg<-summarize(counties,
avg_GR_PT_REV = mean(GR_PT_REV, na.rm = TRUE),
avg_NONOP_EXP = mean(NONOP_EXP, na.rm = TRUE),
avg_NONOP_REV = mean(NONOP_REV, na.rm = TRUE),
avg_OTH_OP_REV = mean(OTH_OP_REV, na.rm = TRUE),
avg_TOT_OP_EXP = mean(TOT_OP_EXP, na.rm = TRUE))
counties_avg<-counties_avg[ which( ! counties_avg$County.Name %in% "Statewide") , ]
require(scales)
## Loading required package: scales
A<-function(x) rescale(x, to = c(0, 1), from = range(x, na.rm = TRUE, finite = TRUE))
counties_avg_scaled<-data.frame(counties_avg[1], apply(counties_avg[2:6],2, A))
names(counties_avg_scaled)<-c('County.Name','Gross Patient Revenue','Non-Operating Expense',
'Non-Operating Revenue','Other Operating Revenue',
'Total Operating Expense')
counties_avg_scaled.m<-melt(counties_avg_scaled)
## Using County.Name as id variables
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
p <- ggplot(counties_avg_scaled.m, aes(variable, County.Name)) + geom_tile(aes(fill = value),
colour = "white") + scale_fill_gradient(low = "white", high = "steelblue")
base_size<-7
p<-p + theme_grey(base_size = base_size) + labs(x='Income Statement Item',y='County Name') +
scale_y_discrete(expand = c(0,0)) + theme(legend.position='none',axis.ticks =
element_blank() , axis.text.x = element_text(size = base_size *
0.8, angle = 330, hjust = 0, colour = "black"))
p
It can be observed that Santa Clara is a county that has the highest income statement across all other counties. The county with the second highest is San Francisco. San Francisco is an urban area so that make sense but Santa Clara would require more investigation.
Raster image of the first 100 colors in R
require(grid)
## Loading required package: grid
grid.raster(matrix(colors()[1:100], ncol=10),
x = unit(.5, "npc"), y = unit(.5, "npc"),interpolate=FALSE)
R’s color palette is organized by shades. The matrix is filled column wise starting from the leftr. The first color is white. The last color is a shade darker than purple.
Spineplot using MTCARS
Dataset with a created indicator variable (MPG above average) to indicate if car is above or below average MPG for this dataset.
require(ggplot2)
data("mtcars")
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
average_mpg<-mean(mtcars$mpg)
mtcars1<-within(mtcars, {
mpg_abv_avg<-ifelse(mtcars$mpg>average_mpg,'Above Average','Below Average')
mpg_abv_avg<-as.factor(mpg_abv_avg)
})
with(mtcars1,spineplot(mpg_abv_avg~wt, breaks=10,main = 'Cars with above average MPG by Weight',
xlab = 'Weight in Tons',ylab = 'MPG'))
There are no cars with an MPG above average that weigh more than 3.5 tons. There are no cars with an MPG below average that weigh less than 2.5 tons. The biggest change in proportion once a car becomes heavier than 3 tons.
Average Obesity by State
Data comes from data.gov. URL in code. https://raw.githubusercontent.com/paolomarco/data/master/RISKFACTORSANDACCESSTOCARE.csv
x <- getURL("https://raw.githubusercontent.com/paolomarco/data/master/RISKFACTORSANDACCESSTOCARE.csv")
prevalence<-read.csv(text=x,header = TRUE,sep = ',',na.strings = -1111.1)
#subsetting the data to only include obesity percentages
prevalence<-subset(prevalence,select = c('State_FIPS_Code', 'County_FIPS_Code','CHSI_County_Name','CHSI_State_Name',
'CHSI_State_Abbr','Obesity'))
#replacing NA values with the average obesity rate
prevalence['Obesity'][is.na(prevalence['Obesity'])]<-mean(prevalence$Obesity,na.rm = TRUE)
#split the dataframe by county
require(dplyr)
states <- group_by(prevalence,CHSI_State_Name)
#compute mean sumary statistics for every state
states_avg<-summarize(states,
avg_obesity = mean(Obesity, na.rm = TRUE))
require(maps)
## Loading required package: maps
require(ggplot2)
require(ggthemes)
## Loading required package: ggthemes
## Warning: replacing previous import by 'grid::arrow' when loading 'ggthemes'
## Warning: replacing previous import by 'grid::unit' when loading 'ggthemes'
## Warning: replacing previous import by 'scales::alpha' when loading
## 'ggthemes'
obesity_prevalence <-data.frame(state = tolower(states_avg$CHSI_State_Name), states_avg)
states_map <-map_data("state")
p<-ggplot(obesity_prevalence, aes(map_id = state)) +
geom_map(aes(fill = avg_obesity), map = states_map) +
expand_limits(x = states_map$long, y = states_map$lat)
#move the legend to the bottom
p<-p + theme(legend.position='bottom',
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),axis.ticks = element_blank(),
axis.title = element_blank(),
axis.text = element_blank()) +
scale_fill_gradient(low="white",high = "red") +
guides(fill = guide_colorbar(barwidth = 10, barheight = .5)) +
ggtitle("Average Obesity in US States")
#add labels
cnames <-aggregate(cbind(long, lat) ~ region,
data = states_map, FUN = function(x) mean(range(x)))
cnames$angle <-0
p<-p + geom_text(data=cnames, aes(long, lat, label = region,
angle=angle, map_id =NULL), size=2.5)
p
Lousiana, Mississippi, South Carolina, and West Virginia have the highest average rate of obesity across their counties. Colorado has the lowest. California is somehwere in the middle (between 20% and 25%).
Social Network Graph
Download data at:http://www.rdatamining.com/data The Graph was reproduced using code from an r-bloggers post. URL: http://www.r-bloggers.com/an-example-of-social-network-analysis-with-r-using-package-igraph/
load("/Users/tobiamartens/Downloads/termDocMatrix.rdata")
#look at a subset of the matrix
termDocMatrix[5:10,1:20]
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## data 1 1 0 0 2 0 0 0 0 0 1 2 1 1 1 0 1 0 0 0
## examples 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## introduction 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## mining 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0
## network 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1
## package 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
termDocMatrix[termDocMatrix>=1] <- 1
termMatrix <- termDocMatrix %*% t(termDocMatrix)
termMatrix[5:10,5:10]
## Terms
## Terms data examples introduction mining network package
## data 53 5 2 34 0 7
## examples 5 17 2 5 2 2
## introduction 2 2 10 2 2 0
## mining 34 5 2 47 1 5
## network 0 2 2 1 17 1
## package 7 2 0 5 1 21
library(igraph)
##
## Attaching package: 'igraph'
##
## The following objects are masked from 'package:dplyr':
##
## %>%, as_data_frame, groups, union
##
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
##
## The following object is masked from 'package:base':
##
## union
g <- graph.adjacency(termMatrix, weighted=T,mode='undirected')
g <- simplify(g)
V(g)$label.cex <- 2.2 * V(g)$degree / max(V(g)$degree)+ .2
## Warning in max(V(g)$degree): no non-missing arguments to max; returning -
## Inf
V(g)$label.color <- rgb(0, 0, .2, .8)
V(g)$frame.color <- NA
egam <- (log(E(g)$weight)+.4) / max(log(E(g)$weight)+.4)
E(g)$color <- rgb(.5, .5, 0, egam)
E(g)$width <- egam
# plot the graph in layout1
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
It looks like the node, R, has the strongest edges (relationships) and the most. There is a particularly strong relationship between R, data, and mining. Introduction has relatively many edges but they are quite weak.
Sunflower plot using two normal distributions
Data: generate two normal distributions to be plotted as a sunflower plot
set.seed(1234)
require(RColorBrewer)
## Loading required package: RColorBrewer
require(hexbin)
## Loading required package: hexbin
x <- round (rnorm(1500, 0, 40),0)
y <- round (rnorm (1500, 0, 100),0)
df<-as.data.frame(cbind(x,y))
rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
hexbinplot(y~x,colramp=rf)
The hexagons with the highest counts are located near the centroid of the data. The centroid is the mean of x and y. There is one hexagon that has a high count but is located pretty far away from the mean of y. This distribution is expected since the data are both mean 0 with a normal distribution.
Polygons on a map - City of Baltimore!
Throw some, throw some stats on that map.. This was adapted from a tutorial on r-bloggers. URL: http://www.r-bloggers.com/shapefile-polygons-plotted-on-google-maps-using-ggmap-in-r-throw-some-throw-some-stats-on-that-mappart-2/
#this code was adapted from a tutorial on r-bloggers
require(ggmap)
## Loading required package: ggmap
## Warning: package 'ggmap' was built under R version 3.2.3
require(RgoogleMaps)
## Loading required package: RgoogleMaps
CenterOfMap <- geocode("Baltimore, MD")
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Baltimore,%20MD&sensor=false
Baltimore <- get_map(c(lon=CenterOfMap$lon, lat=CenterOfMap$lat),zoom = 12, maptype = "terrain", source = "google")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=39.290385,-76.612189&zoom=12&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
BaltimoreMap <- ggmap(Baltimore)
#package for reading in shapefiles
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.0-4, (SVN revision 548)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 1.11.2, released 2015/02/10
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/gdal
## Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
## Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/proj
## Linking to sp version: 1.1-1
#setting working directory where shapefiles are. Shape files can be downloaded at:
#https://data.baltimorecity.gov/Neighborhoods/Neighborhoods-Shape/ysi8-7icr
setwd('/Users/tobiamartens/Desktop/Neighborhood_202010/')
Neighborhoods <- readOGR(".","nhood_2010")
## OGR data source with driver: ESRI Shapefile
## Source: ".", layer: "nhood_2010"
## with 278 features
## It has 6 fields
#transform so it's mappable
Neighborhoods <- spTransform(Neighborhoods, CRS("+proj=longlat +datum=WGS84"))
#fortify to a dataframe so R can understand the data
library(ggplot2)
Neighborhoods <- fortify(Neighborhoods)
## Regions defined for each Polygons
BaltimoreMap <- BaltimoreMap + geom_polygon(aes(x=Neighborhoods$long, y=Neighborhoods$lat
, group=group),fill='green', size=.5,color='black', data=Neighborhoods, alpha=.25)
BaltimoreMap1 <- BaltimoreMap + theme(axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_blank())
BaltimoreMap1
Baltimore is a fairly symmetrical city. It does look like the neighborhoods tend to get slightly bigger as they move away from the city center. John Hopkins is located somewhat outside of the city center.
Starplot of mean values of 5 car key metrics using MTCARS grouped by Cylinder
data("mtcars")
require(dplyr)
mtcars_grouped <- group_by(mtcars,cyl)
mtcars_avg<-summarize(mtcars_grouped,
mpg = mean(mpg, na.rm = TRUE),
disp = mean(disp, na.rm = TRUE),
hp = mean(hp, na.rm = TRUE),
wt = mean(wt, na.rm = TRUE),
qsec = mean(qsec, na.rm = TRUE))
mtcars_avg<-as.data.frame(mtcars_avg)
row.names(mtcars_avg)<-c('4 Cyl','6 cyl','8 cyl')
mtcars_avg<-mtcars_avg[,2:6]
dim(mtcars_avg)
## [1] 3 5
stars(mtcars_avg[, 1:5], locations = c(0,0), key.loc = c(0, 0),
main = "Motor Trends by Number of Cylinders : stars(*, full = T)",
full = TRUE,lty = 2)
This star plot is a way to visualize multivariate data. In this case, the MTCARS data set was grouped by cylinder and all the other metrics were averaged.
Histogram with a Rug
data(mtcars)
with(mtcars, {
hist(mpg,prob=TRUE, main= 'Hist of MPG with Rug & Density')
lines(density(mpg),lwd=2,lty=1)
rug(mpg,ticksize = .03,lwd = 1, col = par("fg"))
})
The rug (ticks below the histogram) show the actual data points. The large cluster between 15 and 20 mpg is slightly surprising because it seems a little low. The wide range (10,35) is also surprising.
Fourfold plot of Surivival on the Titanic by Gender
library(RCurl)
require(qcc)
x <- getURL("https://raw.githubusercontent.com/paolomarco/data/master/train.csv")
titanic<-read.csv(text=x,sep = ',',header = TRUE)
titanic['SurviveFactor']<-ifelse(titanic$Survived==1,'Survived','Died')
require(vcd)
## Loading required package: vcd
two_way_contigency<-as.matrix(table(titanic$SurviveFactor,titanic$Sex))
fourfoldplot(two_way_contigency, std='ind.max',main = 'Fourfold Plot - Survival by Gender on the Titanic')
The fourfold plot shows the proportion of males that died on the titanic is much bigger than the proprotion of women. Each quadrant also displays the number. This plot is useful for the inspection of two dichotomous variables.