This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
##########################################
# section 3.2 Exploratory Data Analysis
##########################################
# Figure 3-5: simulate 50 points where y follows x plus Gaussian noise
x <- rnorm(n = 50)
y <- x + rnorm(n = 50, sd = 0.5) # noise mean is left at its default of 0
# pair the two vectors column-wise in a data frame
data <- data.frame(x = x, y = y)
summary(data)
## x y
## Min. :-2.0148 Min. :-2.4622
## 1st Qu.:-0.7516 1st Qu.:-0.9408
## Median :-0.2601 Median :-0.3122
## Mean :-0.1296 Mean :-0.2057
## 3rd Qu.: 0.4498 3rd Qu.: 0.5645
## Max. : 2.3995 Max. : 2.3859
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
# Scatterplot of the simulated pairs with enlarged axis and title text
ggplot(data = data, mapping = aes(x = x, y = y)) +
  geom_point(size = 2) +
  ggtitle("Scatterplot of X and Y") +
  theme(
    plot.title = element_text(size = 20, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )
##########################################
# section 3.2.1 Visualization Before Analysis
##########################################
library(ggplot2)
# Load the anscombe data set and display it in full
data("anscombe")
anscombe
## x1 x2 x3 x4 y1 y2 y3 y4
## 1 10 10 10 8 8.04 9.14 7.46 6.58
## 2 8 8 8 8 6.95 8.14 6.77 5.76
## 3 13 13 13 8 7.58 8.74 12.74 7.71
## 4 9 9 9 8 8.81 8.77 7.11 8.84
## 5 11 11 11 8 8.33 9.26 7.81 8.47
## 6 14 14 14 8 9.96 8.10 8.84 7.04
## 7 6 6 6 8 7.24 6.13 6.08 5.25
## 8 4 4 4 19 4.26 3.10 5.39 12.50
## 9 12 12 12 8 10.84 9.13 8.15 5.56
## 10 7 7 7 8 4.82 7.26 6.42 7.91
## 11 5 5 5 8 5.68 4.74 5.73 6.89
nrow(anscombe)
## [1] 11
# Build a 4-level factor marking the group of every stacked data point;
# each level is repeated once per row of anscombe
levels <- gl(n = 4, k = nrow(anscombe))
levels
## [1] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 4 4
## [36] 4 4 4 4 4 4 4 4 4
## Levels: 1 2 3 4
# Stack the four (x, y) column pairs into one long data frame tagged by group
mydata <- data.frame(
  x = unlist(anscombe[paste0("x", 1:4)], use.names = FALSE),
  y = unlist(anscombe[paste0("y", 1:4)], use.names = FALSE),
  mygroup = levels
)
mydata
## x y mygroup
## 1 10 8.04 1
## 2 8 6.95 1
## 3 13 7.58 1
## 4 9 8.81 1
## 5 11 8.33 1
## 6 14 9.96 1
## 7 6 7.24 1
## 8 4 4.26 1
## 9 12 10.84 1
## 10 7 4.82 1
## 11 5 5.68 1
## 12 10 9.14 2
## 13 8 8.14 2
## 14 13 8.74 2
## 15 9 8.77 2
## 16 11 9.26 2
## 17 14 8.10 2
## 18 6 6.13 2
## 19 4 3.10 2
## 20 12 9.13 2
## 21 7 7.26 2
## 22 5 4.74 2
## 23 10 7.46 3
## 24 8 6.77 3
## 25 13 12.74 3
## 26 9 7.11 3
## 27 11 7.81 3
## 28 14 8.84 3
## 29 6 6.08 3
## 30 4 5.39 3
## 31 12 8.15 3
## 32 7 6.42 3
## 33 5 5.73 3
## 34 8 6.58 4
## 35 8 5.76 4
## 36 8 7.71 4
## 37 8 8.84 4
## 38 8 8.47 4
## 39 8 7.04 4
## 40 8 5.25 4
## 41 19 12.50 4
## 42 8 5.56 4
## 43 8 7.91 4
## 44 8 6.89 4
# Figure 3-7 with ggplot2: one panel per group, each showing the points and
# a fitted straight line extended across the full x range
theme_set(theme_bw()) # switch the active plot theme to black-and-white
ggplot(data = mydata, mapping = aes(x = x, y = y)) +
  geom_point(size = 4) +
  geom_smooth(method = "lm", fill = NA, fullrange = TRUE) +
  facet_wrap(~ mygroup)
##########################################
# section 3.2.2 Dirty Data
##########################################
# Simulate account-holder ages in one pass: a normal bulk followed by three
# kinds of suspicious values (slightly negative, exactly zero, and 100-110),
# all rounded to whole numbers
age <- round(c(
  rnorm(6000, mean = 40, sd = 10),
  runif(20, min = -2, max = 0),
  numeric(400),
  runif(40, min = 100, max = 110)
))
hist(
  age,
  breaks = 100,
  col = "gray",
  xlab = "Age",
  ylab = "Frequency",
  main = "Age Distribution of Account Holders"
)
# A numeric vector with one missing value in fourth position
x <- c(1:3, NA, 4)
is.na(x)
## [1] FALSE FALSE FALSE TRUE FALSE
# the NA propagates through mean() unless it is removed first
mean(x)
## [1] NA
mean(x, na.rm = TRUE)
## [1] 2.5
# A small data frame whose last row has a missing y
DF <- data.frame(
  x = c(1, 2, 3),
  y = c(10, 20, NA)
)
DF
## x y
## 1 1 10
## 2 2 20
## 3 3 NA
# keep only the rows without any missing value
DF1 <- na.exclude(object = DF)
DF1
## x y
## 1 1 10
## 2 2 20
# 2000 draws from a Beta(2, 4) scaled by 10, followed by 1000 records that
# all sit at exactly 10
mortgage <- c(10 * rbeta(2000, shape1 = 2, shape2 = 4), rep(10, times = 1000))
hist(
  mortgage,
  breaks = 10,
  col = "gray",
  xlab = "Mortgage Age",
  main = "Portfolio Distribution, Years Since Origination"
)
##########################################
# section 3.2.3 Visualizing a Single Variable
##########################################
data("mtcars")
## Dotchart and Barplot ##
# one dot per row of mtcars, labelled with the row name
dotchart(
  mtcars$mpg,
  labels = rownames(mtcars),
  cex = 0.7,
  xlab = "MPG",
  main = "Miles Per Gallon (MPG) of Car Models"
)
# bar heights are the number of rows at each cylinder count
barplot(
  table(mtcars$cyl),
  xlab = "Number of Cylinders",
  main = "Distribution of Car Cylinder Counts"
)
## Histogram and Density Plot ##
# draw 4000 observations from a log-normal distribution
income <- rlnorm(n = 4000, meanlog = 4, sdlog = 0.7)
summary(income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.003 33.762 54.096 69.274 87.139 567.525
# rescale every observation by a factor of 1000
income <- income * 1000
summary(income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3003 33762 54096 69274 87139 567526
# histogram on the raw scale
hist(income, breaks = 500, main = "Histogram of Income", xlab = "Income")
# kernel density estimate on the log10 scale
plot(
  density(log10(income), adjust = 0.5),
  main = "Distribution of Income (log10 scale)"
)
# tick marks along the axis, one per observation
rug(log10(income))
library("ggplot2")
theme_set(theme_grey()) # switch the active plot theme to grey
data("diamonds") # load the diamonds dataset from ggplot2
# Restrict the data to the Premium and Ideal cuts
niceDiamonds <- diamonds[diamonds$cut %in% c("Premium", "Ideal"), ]
summary(niceDiamonds$cut)
## Fair Good Very Good Premium Ideal
## 0 0 0 13791 21551
# Overlaid price densities, one translucent fill per cut, no outline
ggplot(data = niceDiamonds, mapping = aes(x = price, fill = cut)) +
  geom_density(colour = NA, alpha = 0.3)
# The same comparison with price on the log10 scale
ggplot(data = niceDiamonds, mapping = aes(x = log10(price), fill = cut)) +
  geom_density(colour = NA, alpha = 0.3)
##########################################
# section 3.2.4 Examining Multiple Variables
##########################################
# 75 uniform draws between 0 and 10, in ascending order
x <- sort(runif(75, min = 0, max = 10))
# cubic signal in x plus Gaussian noise with sd 20
y <- 200 + x^3 - 10 * x^2 + x + rnorm(75, mean = 0, sd = 20)
lr <- lm(y ~ x) # straight-line regression
poly <- loess(y ~ x) # LOESS fit
fit <- predict(poly) # LOESS fitted values at the observed x
plot(x, y)
# overlay the regression line computed from its intercept and slope
lines(x, coef(lr)[1] + coef(lr)[2] * x, col = 2)
# overlay the LOESS curve
lines(x, fit, col = 4)
## Dotchart and Barplot ##
# order the rows of mtcars by ascending mpg
cars <- mtcars[order(mtcars$mpg), ]
# the grouping variable passed to dotchart() is converted to a factor
cars$cyl <- factor(cars$cyl)
# one colour per cylinder count, looked up by the factor's label
cars$color <- unname(
  c("4" = "red", "6" = "blue", "8" = "darkgreen")[as.character(cars$cyl)]
)
dotchart(
  cars$mpg,
  labels = rownames(cars),
  groups = cars$cyl,
  color = cars$color,
  gcolor = "black",
  cex = 0.7,
  xlab = "Miles Per Gallon",
  main = "Miles Per Gallon (MPG) of Car Models\nGrouped by Cylinder"
)
# cross-tabulate gear counts (rows) against cylinder counts (columns)
counts <- table(mtcars$gear, mtcars$cyl)
# side-by-side bars: one cluster per column, one bar per row, with the row
# names used as legend entries
barplot(
  counts,
  beside = TRUE,
  col = c("#0000FFFF", "#0080FFFF", "#00FFFFFF"),
  legend.text = rownames(counts),
  args.legend = list(x = "top", title = "Number of Gears"),
  xlab = "Number of Cylinders",
  ylab = "Counts",
  main = "Distribution of Car Cylinder Counts and Gears"
)
## Box-and-Whisker Plot ##
# Locate zipIncome.csv. A copy in the working directory is preferred so the
# script is not tied to one machine; otherwise fall back to the absolute path
# the script was originally written against.
zip_income_path <- "zipIncome.csv"
if (!file.exists(zip_income_path)) {
  zip_income_path <- "C:/Users/ryerrapati/Desktop/RAVI/UC/Academics/SEM 1/Big Data Analytics and Science/Week 2/zipIncome.csv"
}
# Fail early with a clear message instead of read.csv()'s connection error
if (!file.exists(zip_income_path)) {
  stop(
    "zipIncome.csv not found in the working directory or at ",
    zip_income_path,
    call. = FALSE
  )
}
DF <- read.csv(zip_income_path, header = TRUE, sep = ",")
# Remove outliers: keep rows whose MeanHouseholdIncome lies strictly between
# 7000 and 200000
DF <- subset(DF, MeanHouseholdIncome > 7000 & MeanHouseholdIncome < 200000)
summary(DF)
## MeanEducation MeanHouseholdIncome Zip1
## Min. : 0.00 Min. : 8465 Min. :0.000
## 1st Qu.:11.88 1st Qu.: 37755 1st Qu.:2.000
## Median :12.44 Median : 44234 Median :4.000
## Mean :12.56 Mean : 48465 Mean :4.474
## 3rd Qu.:13.11 3rd Qu.: 54444 3rd Qu.:7.000
## Max. :19.00 Max. :194135 Max. :9.000
library(ggplot2)
# Jittered scatterplot of log10(MeanHouseholdIncome) for each Zip1 value with
# a translucent boxplot drawn on top; points are colour-coded by Zip1.
# outlier.shape = NA keeps the boxplot from drawing outlier points of its own
# (outlier.size = 0 may still leave small dots), which would duplicate the
# jittered points.
# guides(colour = "none") hides the colour legend; the logical form
# guides(colour = FALSE) is deprecated in current ggplot2 releases.
ggplot(data = DF, aes(x = as.factor(Zip1), y = log10(MeanHouseholdIncome))) +
  geom_point(aes(color = factor(Zip1)), alpha = 0.2, position = "jitter") +
  geom_boxplot(outlier.shape = NA, alpha = 0.1) +
  guides(colour = "none") +
  ggtitle("Mean Household Income by Zip Code")
# Base-graphics boxplot: one box of log10 income per Zip1 value
boxplot(log10(MeanHouseholdIncome) ~ Zip1, data = DF)
title(main = "Mean Household Income by Zip Code")
## Hexbinplot for Large Datasets ##
# scatter log10 income against mean education
plot(log10(MeanHouseholdIncome) ~ MeanEducation, data = DF)
# fit the linear regression, then overlay its straight line in red
income_vs_education <- lm(log10(MeanHouseholdIncome) ~ MeanEducation, data = DF)
abline(income_vs_education, col = "red")
#install.packages("hexbin")
library(hexbin)
## Warning: package 'hexbin' was built under R version 3.5.2
# Hexbin plot of log10 income against mean education.
#   type "g" adds the grid and "r" adds the regression line;
#   trans = sqrt is applied to the counts to give the shading more
#   dynamic range;
#   inv supplies the inverse of trans (squaring undoes the square root).
hexbinplot(
  log10(MeanHouseholdIncome) ~ MeanEducation,
  data = DF,
  type = c("g", "r"),
  trans = sqrt,
  inv = function(x) x^2
)
## Scatterplot Matrix ##
# define the colors, one per level of iris$Species
colors <- c("red", "green", "blue")
#colors <- c("gray50", "white", "black")
# draw the plot matrix of the four measurement columns; each point is filled
# with the color selected by its species' level code
pairs(iris[1:4], main = "Fisher's Iris Dataset",
      pch = 21, bg = colors[unclass(iris$Species)])
# Clip plotting to the figure region (not the plot region) so the legend can
# be drawn in the margin. The previous setting is saved and restored below so
# that later plots are not affected.
old_par <- par(xpd = TRUE)
# add legend; labels come from the factor levels so their order always matches
# the level codes used to index `colors` above
legend(0.2, 0.02, horiz = TRUE, legend = levels(iris$Species),
       fill = colors, bty = "n")
# restore the graphical parameters changed for the legend
par(old_par)
## Analyzing a Variable over Time ##
plot(AirPassengers)
##########################################
# section 3.2.5 Data Exploration Versus Presentation
##########################################
# Generate random log-normal income data
income <- rlnorm(5000, meanlog = log(40000), sdlog = log(5))
# Part I: Create the density plot on the log10 scale
plot(density(log10(income), adjust = 0.5),
     main = "Distribution of Account Values (log10 scale)")
# Add rug to the density plot
rug(log10(income))
# Part II: Make the histogram
# Create "log-like bins". The top bin is open-ended (Inf): with
# sdlog = log(5) a draw above any finite cap is possible, and cut() would
# turn such a value into NA, silently dropping it from the "> 1M" bar.
breaks <- c(0, 1000, 5000, 10000, 50000, 100000, 5e5, 1e6, Inf)
# Create bins and label the data; include.lowest keeps a value equal to the
# first break (0) inside the first bin
bins <- cut(income, breaks, include.lowest = TRUE,
            labels = c("< 1K", "1-5K", "5-10K", "10-50K",
                       "50-100K", "100-500K", "500K-1M", "> 1M"))
# Plot the bins as a bar chart of counts per bin
plot(bins, main = "Distribution of Account Values",
     xlab = "Account value ($ USD)",
     ylab = "Number of Accounts", col = "blue")