R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

##########################################
# section 3.2 Exploratory Data Analysis
##########################################

# Figure 3-5: simulate 50 standard-normal draws for x, then build y as
# x plus Gaussian noise (mean 0, sd 0.5)
x <- rnorm(n = 50)
y <- x + rnorm(n = 50, mean = 0, sd = 0.5)

# Bundle both vectors as the columns of one data frame and print its summary
data <- data.frame(x = x, y = y)
summary(data)
##        x                 y          
##  Min.   :-2.0148   Min.   :-2.4622  
##  1st Qu.:-0.7516   1st Qu.:-0.9408  
##  Median :-0.2601   Median :-0.3122  
##  Mean   :-0.1296   Mean   :-0.2057  
##  3rd Qu.: 0.4498   3rd Qu.: 0.5645  
##  Max.   : 2.3995   Max.   : 2.3859
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
# Scatterplot of the simulated pairs with enlarged axis and title text
ggplot(data = data, mapping = aes(x = x, y = y)) +
  geom_point(size = 2) +
  labs(title = "Scatterplot of X and Y") +
  theme(
    plot.title = element_text(size = 20, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )

##########################################
# section 3.2.1 Visualization Before Analysis
##########################################

library(ggplot2)

# Load the anscombe data set and print every row
data("anscombe")
print(anscombe)
##    x1 x2 x3 x4    y1   y2    y3    y4
## 1  10 10 10  8  8.04 9.14  7.46  6.58
## 2   8  8  8  8  6.95 8.14  6.77  5.76
## 3  13 13 13  8  7.58 8.74 12.74  7.71
## 4   9  9  9  8  8.81 8.77  7.11  8.84
## 5  11 11 11  8  8.33 9.26  7.81  8.47
## 6  14 14 14  8  9.96 8.10  8.84  7.04
## 7   6  6  6  8  7.24 6.13  6.08  5.25
## 8   4  4  4 19  4.26 3.10  5.39 12.50
## 9  12 12 12  8 10.84 9.13  8.15  5.56
## 10  7  7  7  8  4.82 7.26  6.42  7.91
## 11  5  5  5  8  5.68 4.74  5.73  6.89
nrow(anscombe)
## [1] 11
# Build the group labels: a factor with 4 levels, each level repeated once
# per anscombe row. Note that this variable shares its name with the base
# levels() function; calls to levels(...) still resolve to the function.
levels <- gl(n = 4, k = nrow(anscombe))
levels
##  [1] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 4 4
## [36] 4 4 4 4 4 4 4 4 4
## Levels: 1 2 3 4
# Stack the four x columns and the four y columns into long format and tag
# every row with its group label
mydata <- data.frame(
  x = unlist(anscombe[c("x1", "x2", "x3", "x4")], use.names = FALSE),
  y = unlist(anscombe[c("y1", "y2", "y3", "y4")], use.names = FALSE),
  mygroup = levels
)
mydata
##     x     y mygroup
## 1  10  8.04       1
## 2   8  6.95       1
## 3  13  7.58       1
## 4   9  8.81       1
## 5  11  8.33       1
## 6  14  9.96       1
## 7   6  7.24       1
## 8   4  4.26       1
## 9  12 10.84       1
## 10  7  4.82       1
## 11  5  5.68       1
## 12 10  9.14       2
## 13  8  8.14       2
## 14 13  8.74       2
## 15  9  8.77       2
## 16 11  9.26       2
## 17 14  8.10       2
## 18  6  6.13       2
## 19  4  3.10       2
## 20 12  9.13       2
## 21  7  7.26       2
## 22  5  4.74       2
## 23 10  7.46       3
## 24  8  6.77       3
## 25 13 12.74       3
## 26  9  7.11       3
## 27 11  7.81       3
## 28 14  8.84       3
## 29  6  6.08       3
## 30  4  5.39       3
## 31 12  8.15       3
## 32  7  6.42       3
## 33  5  5.73       3
## 34  8  6.58       4
## 35  8  5.76       4
## 36  8  7.71       4
## 37  8  8.84       4
## 38  8  8.47       4
## 39  8  7.04       4
## 40  8  5.25       4
## 41 19 12.50       4
## 42  8  5.56       4
## 43  8  7.91       4
## 44  8  6.89       4
# Draw the scatterplots with ggplot2, using the black-and-white theme
theme_set(theme_bw())

# Figure 3-7: one panel per group, each showing its points and a linear fit
# (fill = NA leaves the confidence ribbon unfilled)
ggplot(data = mydata, mapping = aes(x = x, y = y)) +
  geom_point(size = 4) +
  geom_smooth(method = "lm", fullrange = TRUE, fill = NA) +
  facet_wrap(facets = ~mygroup)

##########################################
# section 3.2.2 Dirty Data
##########################################

# Simulated account-holder ages: a normal bulk followed by three batches of
# suspicious values, all rounded to whole years
age <- c(
  rnorm(6000, mean = 40, sd = 10),  # bulk: mean 40, sd 10
  runif(20, min = -2, max = 0),     # 20 values between -2 and 0
  rep(0, times = 400),              # 400 exact zeros
  runif(40, min = 100, max = 110)   # 40 values between 100 and 110
)
age <- round(age)

hist(age, breaks = 100, col = "gray",
     xlab = "Age", ylab = "Frequency",
     main = "Age Distribution of Account Holders")

# Numeric vector whose fourth element is missing
x <- c(1, 2, 3, NA, 4)
# is.na() flags the missing element
is.na(x)
## [1] FALSE FALSE FALSE  TRUE FALSE
# with an NA present, mean() itself returns NA ...
mean(x)
## [1] NA
# ... unless the NA is dropped first via na.rm=TRUE
mean(x, na.rm=TRUE)
## [1] 2.5
# Small data frame whose third row has a missing y
DF <- data.frame(x = c(1, 2, 3), y = c(10, 20, NA))
DF
##   x  y
## 1 1 10
## 2 2 20
## 3 3 NA
# na.exclude() drops the row containing the NA (row 3 is gone below)
DF1 <- na.exclude(DF)
DF1
##   x  y
## 1 1 10
## 2 2 20
# Simulated mortgage ages: 2000 Beta(2, 4) draws scaled by 10, followed by
# 1000 entries fixed at exactly 10
mortgage <- c(10 * rbeta(2000, shape1 = 2, shape2 = 4), rep(10, times = 1000))
hist(mortgage, breaks = 10, col = "gray", xlab = "Mortgage Age",
     main = "Portfolio Distribution, Years Since Origination")

##########################################
# section 3.2.3 Visualizing a Single Variable
##########################################
data("mtcars")

## Dotchart and Barplot ##

# One dot per car, labelled with the data frame's row names
dotchart(x = mtcars[["mpg"]], labels = rownames(mtcars), cex = 0.7,
         xlab = "MPG",
         main = "Miles Per Gallon (MPG) of Car Models")

# Bar heights are the number of cars for each distinct cyl value
barplot(height = table(mtcars[["cyl"]]),
        xlab = "Number of Cylinders",
        main = "Distribution of Car Cylinder Counts")

## Histogram and Density Plot ##

# randomly generate 4000 observations from the log normal distribution
# (meanlog and sdlog are given on the log scale)
income <- rlnorm(4000, meanlog = 4, sdlog = 0.7)
summary(income)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.003  33.762  54.096  69.274  87.139 567.525
# scale every value by 1000; the summary below shows the same figures x 1000
income <- 1000*income
summary(income)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3003   33762   54096   69274   87139  567526
# Histogram of the scaled incomes, requesting 500 breaks
hist(income, breaks = 500, main = "Histogram of Income", xlab = "Income")

# Kernel density of log10(income) with a rug marking each observation;
# the log10 values are computed once inside a local scope and shared by both
local({
  log_income <- log10(income)
  plot(density(log_income, adjust = 0.5),
       main = "Distribution of Income (log10 scale)")
  rug(log_income)
})

library("ggplot2")

# Use the grey ggplot2 theme for the plots that follow
theme_set(theme_grey())

# Load the diamonds dataset from ggplot2
data("diamonds")
# Keep only the rows whose cut is Premium or Ideal
niceDiamonds <- diamonds[diamonds$cut %in% c("Premium", "Ideal"), ]

summary(niceDiamonds$cut)
##      Fair      Good Very Good   Premium     Ideal 
##         0         0         0     13791     21551
# Overlaid price densities, one translucent fill per cut, no outline
ggplot(data = niceDiamonds, mapping = aes(x = price, fill = cut)) +
  geom_density(colour = NA, alpha = 0.3)

# Same comparison using log10(price) on the x axis
ggplot(data = niceDiamonds, mapping = aes(x = log10(price), fill = cut)) +
  geom_density(colour = NA, alpha = 0.3)

##########################################
# section 3.2.4 Examining Multiple Variables
##########################################

# 75 uniform draws between 0 and 10, sorted ascending
x <- sort(runif(75, min = 0, max = 10))
# Cubic function of x plus Gaussian noise (mean 0, sd 20)
y <- 200 + x^3 - 10 * x^2 + x + rnorm(75, mean = 0, sd = 20)
lr <- lm(y ~ x)       # straight-line fit
poly <- loess(y ~ x)  # LOESS fit
fit <- predict(poly)  # LOESS predictions at the observed x values
plot(x, y)
# overlay the linear regression line (colour 2), built from its coefficients
lines(x, lr$coefficients[1] + lr$coefficients[2] * x, col = 2)
# overlay the LOESS curve (colour 4)
lines(x, fit, col = 4)

## Dotchart and Barplot ##

# Order the cars from lowest to highest mpg
cars <- mtcars[order(mtcars$mpg), ]
# dotchart() groups by a factor, so convert the cylinder counts
cars$cyl <- factor(cars$cyl)
# Colour per cylinder count, looked up by the factor's label
cars$color <- unname(
  c("4" = "red", "6" = "blue", "8" = "darkgreen")[as.character(cars$cyl)]
)
dotchart(cars$mpg, labels = rownames(cars), groups = cars$cyl, cex = 0.7,
         color = cars$color, gcolor = "black",
         xlab = "Miles Per Gallon",
         main = "Miles Per Gallon (MPG) of Car Models\nGrouped by Cylinder")

# Cross-tabulate gear counts (rows) against cylinder counts (columns)
counts <- table(mtcars$gear, mtcars$cyl)
# Grouped bars: one cluster per cylinder count, one bar per gear count.
# The legend argument is spelled out as legend.text rather than relying on
# partial matching of the abbreviated name `legend`.
barplot(counts, beside = TRUE,
        main = "Distribution of Car Cylinder Counts and Gears",
        xlab = "Number of Cylinders", ylab = "Counts",
        col = c("#0000FFFF", "#0080FFFF", "#00FFFFFF"),
        legend.text = rownames(counts),
        args.legend = list(x = "top", title = "Number of Gears"))

## Box-and-Whisker Plot ##

# Load the zip-code income data. The original absolute path is tried first;
# if it does not exist on this machine, fall back to a copy of zipIncome.csv
# in the working directory so the document can be knitted elsewhere.
zip_income_path <- "C:/Users/ryerrapati/Desktop/RAVI/UC/Academics/SEM 1/Big Data Analytics and Science/Week 2/zipIncome.csv"
if (!file.exists(zip_income_path)) {
  zip_income_path <- "zipIncome.csv"
}
# header = TRUE and sep = "," are read.csv() defaults, so they are omitted
DF <- read.csv(zip_income_path)

# Remove outliers: keep rows whose MeanHouseholdIncome is strictly between
# 7000 and 200000 (subset() evaluates the condition inside DF, so the
# columns are referenced directly)
DF <- subset(DF, MeanHouseholdIncome > 7000 & MeanHouseholdIncome < 200000)
summary(DF)
##  MeanEducation   MeanHouseholdIncome      Zip1      
##  Min.   : 0.00   Min.   :  8465      Min.   :0.000  
##  1st Qu.:11.88   1st Qu.: 37755      1st Qu.:2.000  
##  Median :12.44   Median : 44234      Median :4.000  
##  Mean   :12.56   Mean   : 48465      Mean   :4.474  
##  3rd Qu.:13.11   3rd Qu.: 54444      3rd Qu.:7.000  
##  Max.   :19.00   Max.   :194135      Max.   :9.000
library(ggplot2)
# plot the jittered scatterplot w/ boxplot
# color-code points with zip codes
# outlier.shape = NA stops the boxplot from drawing its own outlier points
# (the jittered layer already shows every observation); guides(colour = "none")
# hides the colour legend without the deprecated FALSE shorthand
ggplot(data = DF, aes(x = as.factor(Zip1), y = log10(MeanHouseholdIncome))) +
  geom_point(aes(color = factor(Zip1)), alpha = 0.2, position = "jitter") +
  geom_boxplot(outlier.shape = NA, alpha = 0.1) +
  guides(colour = "none") +
  ggtitle("Mean Household Income by Zip Code")

# Base-graphics boxplot of log10 income for each Zip1 value
boxplot(log10(MeanHouseholdIncome) ~ Zip1, data = DF)
title(main = "Mean Household Income by Zip Code")

## Hexbinplot for Large Datasets ##

# Scatterplot of log10 income against education, overlaid with the fitted
# straight line of the linear regression (red). The formula is defined once
# in a local scope and shared by the plot and the model.
local({
  income_vs_education <- log10(MeanHouseholdIncome) ~ MeanEducation
  plot(income_vs_education, data = DF)
  abline(reg = lm(income_vs_education, data = DF), col = "red")
})

#install.packages("hexbin")
library(hexbin)
## Warning: package 'hexbin' was built under R version 3.5.2
#
# "g" adds the grid, "r" adds the regression line
# sqrt transform on the count gives more dynamic range to the shading
# inv provides the inverse transformation function of trans
#
hexbinplot(log10(MeanHouseholdIncome) ~ MeanEducation, data = DF,
           type = c("g", "r"),
           trans = sqrt, inv = function(x) x^2)

## Scatterplot Matrix ##

# define the colors, one per Species level (indexed by level code below)
colors <- c("red", "green", "blue")
#colors <- c("gray50", "white", "black")

# draw the plot matrix; each point is filled with its species' color
pairs(iris[1:4], main = "Fisher's Iris Dataset",
      pch = 21, bg = colors[unclass(iris$Species)])
# Clip plotting to the figure region only while the legend is drawn, then
# restore the previous setting so later plots are not affected.
old_par <- par(xpd = TRUE)
# Legend labels come from levels() so they line up with the level codes that
# were used to pick the point colors above.
legend(0.2, 0.02, legend = levels(iris$Species), horiz = TRUE,
       fill = colors, bty = "n")
par(old_par)

## Analyzing a Variable over Time ##

# Plot the AirPassengers dataset with the default plot method
plot(AirPassengers)

##########################################
# section 3.2.5 Data Exploration Versus Presentation
##########################################

# Generate random log normal income data
# (meanlog = log(40000) puts the median at 40,000; sdlog = log(5))
income <- rlnorm(5000, meanlog = log(40000), sdlog = log(5))
# Part I: Create the density plot
plot(density(log10(income), adjust = 0.5),
     main = "Distribution of Account Values (log10 scale)")
# Add rug to the density plot
rug(log10(income))

# Part II: Make the histogram
# Create "log-like bins". The top bin is open-ended (Inf): with a finite cap
# of 2e7, any larger draw would fall outside every interval, become NA in
# cut(), and silently vanish from the "> 1M" bar.
breaks <- c(0, 1000, 5000, 10000, 50000, 100000, 5e5, 1e6, Inf)
# Create bins and label the data
bins <- cut(income, breaks, include.lowest = TRUE,
            labels = c("< 1K", "1-5K", "5-10K", "10-50K",
                       "50-100K", "100-500K", "500K-1M", "> 1M"))
# Plot the bins (bar heights are the number of accounts per bin)
plot(bins, main = "Distribution of Account Values",
     xlab = "Account value ($ USD)",
     ylab = "Number of Accounts", col = "blue")