Load libraries and retrieve data from remote URL
# Load libraries RCurl -> for loading data from URL, plotly -> for making plots interactive, ggrepel -> for Text Labels
library(RCurl)
## Loading required package: bitops
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:graphics':
##
## layout
library(ggrepel)
# Load Data from Remote URL: download the raw CSV text as a single string
rData <- getURL('http://vincentarelbundock.github.io/Rdatasets/csv/HSAUR/water.csv')
# Parse the downloaded CSV text into lData (local data); header = TRUE is the
# read.csv default. Passing the string through `text =` lets read.csv create
# and close its own connection, rather than leaving an explicitly opened
# textConnection() dangling.
lData <- read.csv(text = rData)
# View Data
knitr::kable(lData)
X | location | town | mortality | hardness |
---|---|---|---|---|
1 | South | Bath | 1247 | 105 |
2 | North | Birkenhead | 1668 | 17 |
3 | South | Birmingham | 1466 | 5 |
4 | North | Blackburn | 1800 | 14 |
5 | North | Blackpool | 1609 | 18 |
6 | North | Bolton | 1558 | 10 |
7 | North | Bootle | 1807 | 15 |
8 | South | Bournemouth | 1299 | 78 |
9 | North | Bradford | 1637 | 10 |
10 | South | Brighton | 1359 | 84 |
11 | South | Bristol | 1392 | 73 |
12 | North | Burnley | 1755 | 12 |
13 | South | Cardiff | 1519 | 21 |
14 | South | Coventry | 1307 | 78 |
15 | South | Croydon | 1254 | 96 |
16 | North | Darlington | 1491 | 20 |
17 | North | Derby | 1555 | 39 |
18 | North | Doncaster | 1428 | 39 |
19 | South | East Ham | 1318 | 122 |
20 | South | Exeter | 1260 | 21 |
21 | North | Gateshead | 1723 | 44 |
22 | North | Grimsby | 1379 | 94 |
23 | North | Halifax | 1742 | 8 |
24 | North | Huddersfield | 1574 | 9 |
25 | North | Hull | 1569 | 91 |
26 | South | Ipswich | 1096 | 138 |
27 | North | Leeds | 1591 | 16 |
28 | South | Leicester | 1402 | 37 |
29 | North | Liverpool | 1772 | 15 |
30 | North | Manchester | 1828 | 8 |
31 | North | Middlesbrough | 1704 | 26 |
32 | North | Newcastle | 1702 | 44 |
33 | South | Newport | 1581 | 14 |
34 | South | Northampton | 1309 | 59 |
35 | South | Norwich | 1259 | 133 |
36 | North | Nottingham | 1427 | 27 |
37 | North | Oldham | 1724 | 6 |
38 | South | Oxford | 1175 | 107 |
39 | South | Plymouth | 1486 | 5 |
40 | South | Portsmouth | 1456 | 90 |
41 | North | Preston | 1696 | 6 |
42 | South | Reading | 1236 | 101 |
43 | North | Rochdale | 1711 | 13 |
44 | North | Rotherham | 1444 | 14 |
45 | North | St Helens | 1591 | 49 |
46 | North | Salford | 1987 | 8 |
47 | North | Sheffield | 1495 | 14 |
48 | South | Southampton | 1369 | 68 |
49 | South | Southend | 1257 | 50 |
50 | North | Southport | 1587 | 75 |
51 | North | South Shields | 1713 | 71 |
52 | North | Stockport | 1557 | 13 |
53 | North | Stoke | 1640 | 57 |
54 | North | Sunderland | 1709 | 71 |
55 | South | Swansea | 1625 | 13 |
56 | North | Wallasey | 1625 | 20 |
57 | South | Walsall | 1527 | 60 |
58 | South | West Bromwich | 1627 | 53 |
59 | South | West Ham | 1486 | 122 |
60 | South | Wolverhampton | 1485 | 81 |
61 | North | York | 1378 | 71 |
# Data Summary: per-column summary of the loaded data frame
summary(object = lData)
## X location town mortality hardness
## Min. : 1 North:35 Bath : 1 Min. :1096 Min. : 5.00
## 1st Qu.:16 South:26 Birkenhead: 1 1st Qu.:1379 1st Qu.: 14.00
## Median :31 Birmingham: 1 Median :1555 Median : 39.00
## Mean :31 Blackburn : 1 Mean :1524 Mean : 47.18
## 3rd Qu.:46 Blackpool : 1 3rd Qu.:1668 3rd Qu.: 75.00
## Max. :61 Bolton : 1 Max. :1987 Max. :138.00
## (Other) :55
# Scatter Diagram 1
# Base plot: mortality on x, hardness on y
b <- ggplot(data = lData, mapping = aes(x = mortality, y = hardness))
# Points are coloured by hardness, sized by mortality and shaped by location
b +
  geom_point(
    mapping = aes(color = hardness, size = mortality, shape = factor(location))
  ) +
  scale_colour_gradient(low = "purple")
# Lets make it interactive (ggplotly() with no argument converts the last plot)
ggplotly()
In the above scatter plot we can see a clear distinction between the triangles and the circles. The triangles have a higher hardness rate and the circles have a higher mortality rate.
# Scatter Diagram 2
# Mortality against hardness, coloured by location, with a smoothing layer
# and town labels placed by ggrepel.
ggplot(
  data = subset(lData, location %in% c("North", "South")),
  mapping = aes(x = mortality, y = hardness, color = location)
) +
  geom_point() +
  geom_smooth() +
  geom_text_repel(mapping = aes(label = town), size = 3)
In the above scatter plot we can clearly see that the North has a higher mortality rate than the South. We can also see that the town Ipswich has the lowest mortality rate (1096) and the town Salford has the highest mortality rate (1987).
# Create Histogram for Mortality
# The bars are drawn on the density scale (y = ..density..) so the kernel
# density curve can share the same axis; the bar fill encodes the bin count.
# The x aesthetic refers to the column by name (not lData$mortality) so it is
# resolved inside the `data` argument like every other mapping in this plot.
ggplot(data = lData, mapping = aes(x = mortality)) +
  geom_histogram(
    aes(y = ..density.., fill = ..count..),
    breaks = seq(1000, 2000, by = 100),
    alpha = .5
  ) +
  geom_density(col = 1) +
  # The y axis shows density, so it is labelled accordingly
  labs(title = "Histogram for Mortality", x = "Mortality", y = "Density") +
  # Dashed blue vertical line at the mean mortality
  geom_vline(aes(xintercept = mean(mortality)), color = "blue", linetype = "dashed", size = 1)
The above histogram shows that mortality rates occur most frequently between 1300 and 1600. The dashed blue line represents the mean of the distribution.
# Create Histogram for Water Hardness
# The bars are drawn on the density scale (y = ..density..) so the kernel
# density curve can share the same axis; the bar fill encodes the bin count.
# The x aesthetic refers to the column by name (not lData$hardness) so it is
# resolved inside the `data` argument like every other mapping in this plot.
ggplot(data = lData, mapping = aes(x = hardness)) +
  geom_histogram(
    aes(y = ..density.., fill = ..count..),
    breaks = seq(0, 150, by = 10),
    alpha = .5
  ) +
  geom_density(col = 2) +
  # The y axis shows density, so it is labelled accordingly
  labs(title = "Histogram for Water Hardness", x = "Water Hardness", y = "Density") +
  # Dashed blue vertical line at the mean hardness
  geom_vline(aes(xintercept = mean(hardness)), color = "blue", linetype = "dashed", size = 1)
As we can see in the histogram, water hardness values between 10 and 20 are the most frequent, and the dashed blue line represents the mean of the distribution.
# North vs South Mortality Histogram
# Overlaid (position = "identity"), semi-transparent histograms, one per location
ggplot(
  data = lData,
  mapping = aes(x = mortality, color = location, fill = location)
) +
  geom_histogram(
    position = "identity",
    breaks = seq(1000, 2000, by = 100),
    alpha = 0.5
  ) +
  scale_color_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  scale_fill_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  labs(title = "Mortality histogram plot", x = "Mortality", y = "Count")
# Lets make it interactive (ggplotly() with no argument converts the last plot)
ggplotly()
The mortality rate for the South region lies between 1000 and 1700, whereas the mortality rate of the North region lies between 1300 and 2000.
# North vs South Water Hardness Histogram
# Overlaid (position = "identity"), semi-transparent histograms, one per location
ggplot(
  data = lData,
  mapping = aes(x = hardness, color = location, fill = location)
) +
  geom_histogram(
    position = "identity",
    breaks = seq(0, 150, by = 10),
    alpha = 0.5
  ) +
  scale_color_manual(values = c("#999999", "#BEE181", "#5B6BAA")) +
  scale_fill_manual(values = c("#999999", "#BEE181", "#5B6BAA")) +
  labs(title = "Water Hardness histogram plot", x = "hardness", y = "Count")
# Lets make it interactive (ggplotly() with no argument converts the last plot)
ggplotly()
In the North region, water hardness is concentrated at low values, whereas the South region tends to have higher water hardness values.
# Bar plot of hardness against mortality (bar heights taken directly from the
# data via stat = "identity"), with a smoothing layer on top
ggplot(data = lData, mapping = aes(x = mortality, y = hardness)) +
  geom_bar(stat = "identity") +
  geom_smooth()
The above bar plot reveals that the water hardness rate is inversely related to the mortality rate, i.e. as hardness decreases, mortality increases.
# Dotplot: Grouped Sorted and Colored
# Sort by mortality, group and color by location
xm <- lData[order(lData$mortality), ] # sort by mortality
xm$loc <- factor(xm$location) # dotchart() needs `groups` to be a factor
# One colour per row, looked up by location name (NA for any other value)
xm$color <- unname(c(North = "red", South = "blue")[as.character(xm$location)])
# Pass the factor column xm$loc as `groups`: the raw location column is only a
# factor when read.csv converts strings to factors, which is not guaranteed.
dotchart(xm$mortality, labels = row.names(xm), cex = .50, groups = xm$loc,
         main = "Mortality rates grouped by location",
         xlab = "Mortality rate", gcolor = "green", color = xm$color)
# Dotplot: Grouped Sorted and Colored
# Sort by hardness, group and color by location
wHard <- lData[order(lData$hardness), ] # sort by hardness
wHard$loc <- factor(wHard$location) # dotchart() needs `groups` to be a factor
# One colour per row, looked up by location name (NA for any other value)
wHard$color <- unname(c(North = "red", South = "blue")[as.character(wHard$location)])
# Labels must come from wHard itself so each label matches its own point;
# the factor column wHard$loc is passed as `groups` because the raw location
# column is only a factor when read.csv converts strings to factors.
dotchart(wHard$hardness, labels = row.names(wHard), cex = .50, groups = wHard$loc,
         main = "Water Hardness grouped by location",
         xlab = "Water Hardness", gcolor = "Gray", color = wHard$color)
# Mortality Boxplot based on Location
# One box per location; columns are looked up in lData via the `data` argument
boxplot(
  mortality ~ location,
  data = lData,
  main = "Mortality data",
  xlab = "Location",
  ylab = "Mortality data"
)
The boxplot clearly shows that the mortality rate is comparatively higher in the North region than in the South. Observe the median of both regions.
The middle 50% of the mortality rate in the North region lies between 1550 and 1700, and the median of the North region is a little higher than 1600. We can see only one data point as an outlier.
The median of the mortality rate in the South region is a little higher than 1300. The middle 50% lies between 1250 and 1500.
# Water Hardness based on Location
# One box per location; columns are looked up in lData via the `data` argument
boxplot(
  hardness ~ location,
  data = lData,
  main = "Water Hardness",
  xlab = "Location",
  ylab = "Water Hardness"
)
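Water hardness in the North region is skewed right, i.e. most of the values are concentrated at the low end with a tail extending toward higher values, whereas water hardness in the South region is skewed left, i.e. most of the values are concentrated at the high end with a tail extending toward lower values.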
# Hardness vs Mortality Boxplot based on Location
# NOTE(review): x is mapped to the numeric mortality column, and the knitted
# output for this plot shows "position_dodge requires non-overlapping x
# intervals" — confirm whether a categorical x (e.g. location) was intended.
ggplot(
  data = lData,
  mapping = aes(x = mortality, y = hardness, color = location)
) +
  geom_boxplot()
## Warning: position_dodge requires non-overlapping x intervals