Readings
- Wickham Chapter 7 - Exploratory Data Analysis
- Wickham Chapter 3 - Data Visualization
- Wickham Chapter 27 - R Markdown
- Wickham Chapter 28 - Graphics for Communication
str() - Displays a preview of the internal structure of an object: first few observations, variable names, and variable types
class() - What kind of object is it? (high-level)
typeof() - What is the object's data type? (low-level)
length() - How long is it?
attributes() - Does it have any metadata?
Numb.Kids <- c(1, 2, 3, 4, 5, 6, 7)
Numb.Fam1 <- c(100, 200, 300, 200, 100, 50, 50 )
Numb.Fam2 <- c(500, 200, 10, 10, 10, 300, 100)
Tot.Kids1 <- Numb.Kids * Numb.Fam1
Tot.Kids2 <- Numb.Kids * Numb.Fam2
City1 <- data.frame(Numb.Kids, Numb.Fam1, Tot.Kids1)
City1
## Numb.Kids Numb.Fam1 Tot.Kids1
## 1 1 100 100
## 2 2 200 400
## 3 3 300 900
## 4 4 200 800
## 5 5 100 500
## 6 6 50 300
## 7 7 50 350
City2 <- data.frame(Numb.Kids, Numb.Fam2, Tot.Kids2)
City2
## Numb.Kids Numb.Fam2 Tot.Kids2
## 1 1 500 500
## 2 2 200 400
## 3 3 10 30
## 4 4 10 40
## 5 5 10 50
## 6 6 300 1800
## 7 7 100 700
Cities <- data.frame(Numb.Kids, Numb.Fam1, Numb.Fam2, Tot.Kids1, Tot.Kids2)
Cities
## Numb.Kids Numb.Fam1 Numb.Fam2 Tot.Kids1 Tot.Kids2
## 1 1 100 500 100 500
## 2 2 200 200 400 400
## 3 3 300 10 900 30
## 4 4 200 10 800 40
## 5 5 100 10 500 50
## 6 6 50 300 300 1800
## 7 7 50 100 350 700
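A quick sketch applying the inspection functions from above to these objects (the values follow from the definitions):
class(Cities) # "data.frame" (high-level)
typeof(Numb.Kids) # "double" (low-level)
length(Numb.Kids) # 7
str(Cities) # previews all five variables at once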
Note: grid.arrange() is from the gridExtra package.
library(tidyverse) # assumed here for ggplot2 and the %>% pipe used below
library(gridExtra) # for grid.arrange()
City1Plot <- City1 %>% ggplot(aes(x = Numb.Kids, y = Numb.Fam1)) +
geom_col() +
labs(subtitle = "City 1") +
ylim(0,500) +
theme(axis.title = element_blank())
City2Plot <- City2 %>% ggplot(aes(x = Numb.Kids, y = Numb.Fam2)) +
geom_col() +
labs(subtitle = "City 2") +
theme(axis.title = element_blank())
grid.arrange(City1Plot, City2Plot, nrow=1, top = "Distribution of Family Size in Two Cities", bottom = "Number of Kids", left = "Number of Families")
library(psych)
#describe(City1)
#describe(City2)
describe(Cities) # from the psych package
## vars n mean sd median trimmed mad min max range skew
## Numb.Kids 1 7 4.00 2.16 4 4.00 2.97 1 7 6 0.00
## Numb.Fam1 2 7 142.86 93.22 100 142.86 74.13 50 300 250 0.44
## Numb.Fam2 3 7 161.43 186.14 100 161.43 133.43 10 500 490 0.68
## Tot.Kids1 4 7 478.57 282.63 400 478.57 148.26 100 900 800 0.29
## Tot.Kids2 5 7 502.86 629.15 400 502.86 518.91 30 1800 1770 1.08
## kurtosis se
## Numb.Kids -1.71 0.82
## Numb.Fam1 -1.51 35.23
## Numb.Fam2 -1.20 70.35
## Tot.Kids1 -1.57 106.82
## Tot.Kids2 -0.29 237.79
#describeBy() # Summarizes by Grouping Variable
Both cities have roughly the same number of families (1000 vs. 1130) and roughly the same number of kids (3350 vs. 3520).
They also have similar average numbers of kids per family:
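A quick check, using the totals above:
sum(Tot.Kids1) / sum(Numb.Fam1) # City 1: 3350 / 1000 = 3.35 kids per family
sum(Tot.Kids2) / sum(Numb.Fam2) # City 2: 3520 / 1130 = about 3.12 kids per family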
If the distribution of the data is spread out, what does that imply?
Why would this matter?
Box1 <- ggplot(Cities, aes(x = Tot.Kids1) ) + geom_boxplot(fill= "blue") + xlim(0, 2000) + labs(title = 'City 1')
Box2 <- ggplot(Cities, aes(x = Tot.Kids2)) + geom_boxplot(fill = "purple")+ xlim(0, 2000) + labs(title = 'City 2')
grid.arrange(Box1, Box2, nrow=1)
Box3 <- ggplot(Cities, aes(x = Numb.Fam1) ) + geom_boxplot(fill= "blue") + xlim(0, 600) + labs(title = 'City 1')
Box3
Box4 <- ggplot(Cities, aes(x = Numb.Fam2)) + geom_boxplot(fill = "purple")+ xlim(0, 600) + labs(title = 'City 2')
grid.arrange(Box3, Box4, nrow=1)
grid.arrange(City1Plot, City2Plot, Box1, Box2, nrow=2, ncol = 2)
set.seed(1234)
x <- c(rnorm(500, mean = -1), rnorm(500, mean = 1.5))
y <- c(rnorm(500, mean = 1), rnorm(500, mean = 1.7))
group <- as.factor(rep(c(1,2), each=500))
df <- data.frame(x, y, group)
head(df)
## x y group
## 1 -2.20706575 -0.2053334 1
## 2 -0.72257076 1.3014667 1
## 3 0.08444118 -0.5391452 1
## 4 -3.34569770 1.6353707 1
## 5 -0.57087531 1.7029518 1
## 6 -0.49394411 -0.9058829 1
# scatter plot of x and y variables
# color by groups
scatterPlot <- ggplot(df,aes(x, y, color=group)) +
geom_point() +
scale_color_manual(values = c('#999999','#E69F00')) +
theme(legend.position=c(0,1), legend.justification=c(0,1))
scatterPlot
# Marginal density plot of x (top panel)
xdensity <- ggplot(df, aes(x, fill=group)) +
geom_density(alpha=.5) +
scale_fill_manual(values = c('#999999','#E69F00')) +
theme(legend.position = "none")
xdensity
# Marginal density plot of y (right panel)
ydensity <- ggplot(df, aes(y, fill=group)) +
geom_density(alpha=.5) +
scale_fill_manual(values = c('#999999','#E69F00')) +
theme(legend.position = "none")
ydensity
blankPlot <- ggplot()+geom_blank(aes(1,1))+
theme(plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank()
)
#install.packages("gridExtra")
library("gridExtra")
grid.arrange(xdensity, blankPlot, scatterPlot, ydensity,
ncol=2, nrow=2, widths=c(4, 1.4), heights=c(1.4, 4))
# Load
# install.packages("ggExtra")
library("ggExtra")
## Warning: package 'ggExtra' was built under R version 4.0.5
# Create some data
set.seed(1234)
x <- c(rnorm(500, mean = -1), rnorm(500, mean = 1.5))
y <- c(rnorm(500, mean = 1), rnorm(500, mean = 1.7))
df3 <- data.frame(x, y)
# Scatter plot of x and y variables and color by groups
sp2 <- ggplot(df3,aes(x, y)) + geom_point()
# Marginal density plot
ggMarginal(sp2 + theme_gray())
# Marginal histogram plot
ggMarginal(sp2 + theme_gray(), type = "histogram",
fill = "steelblue", col = "darkblue")
# Create data
my_variable <- c(rnorm(1000, 0, 2), rnorm(1000, 9, 2))
# Layout to split the screen
layout(mat = matrix(c(1, 2), 2, 1, byrow = TRUE), heights = c(1, 8))
# Draw the boxplot and the histogram
par(mar = c(0, 3.1, 1.1, 2.1))
boxplot(my_variable, horizontal = TRUE, ylim = c(-10, 20), xaxt = "n", col = rgb(0.8, 0.8, 0, 0.5), frame = FALSE)
par(mar = c(4, 3.1, 1.1, 2.1))
hist(my_variable, breaks = 40, col = rgb(0.2, 0.8, 0.5, 0.5), border = FALSE, main = "", xlab = "value of the variable", xlim = c(-10, 20))
Bivariate data is used to find out whether there is a relationship between two different variables. It is frequently represented with scatter plots, where one variable is on the x-axis and the other on the y-axis. If the data seem to fit a line or curve, there may be a relationship, or correlation, between the two variables. Always be careful when examining relationships: many variables may appear related when in fact the relationship arose by chance or a third variable is influencing both.
Test of Association: Used to test whether two variables are related to one another or not
- Depends on the nature of the variables you are studying (nominal, ordinal, interval) and number of categories
- Depends on the nature of the question you’re answering (independence, agreement of coders, effect of an intervention, etc.)
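For example, a minimal sketch assuming two nominal variables v1 and v2 (hypothetical names):
# chi-squared test of independence for two nominal variables
chisq.test(table(v1, v2))
# for two interval variables, use a correlation test instead (see the Correlation section below)
cor.test(v1, v2)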
Questions to ask yourself:
“Variation creates uncertainty but covariation reduces it.” - pg 106
> Content warning: Murder
This recreates and expands on the example in the book Practical Statistics for Data Scientists: 50 Essential Concepts by Peter Bruce & Andrew Bruce (one of the assigned readings).
getwd() # check my working directory to make sure that R can find my file without the full file path
## [1] "C:/Users/aleaw/OneDrive - University of Illinois at Chicago/Week 11"
state <- read_csv("Bruce&Bruce_murder.csv") # bring the CSV into R as a tibble
##
## -- Column specification --------------------------------------------------------
## cols(
## State = col_character(),
## Population = col_double(),
## Murder.Rate = col_double(),
## Abbreviation = col_character()
## )
names(state) # tells me names of variables
## [1] "State" "Population" "Murder.Rate" "Abbreviation"
str(state) # shows preview of dataset structure
## spec_tbl_df [50 x 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ State : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Population : num [1:50] 4779736 710231 6392017 2915918 37253956 ...
## $ Murder.Rate : num [1:50] 5.7 5.6 4.7 5.6 4.4 2.8 2.4 5.8 5.8 5.7 ...
## $ Abbreviation: chr [1:50] "AL" "AK" "AZ" "AR" ...
## - attr(*, "spec")=
## .. cols(
## .. State = col_character(),
## .. Population = col_double(),
## .. Murder.Rate = col_double(),
## .. Abbreviation = col_character()
## .. )
summary(state) # calculates statistics for all numeric variables
## State Population Murder.Rate Abbreviation
## Length:50 Min. : 563626 Min. : 0.900 Length:50
## Class :character 1st Qu.: 1833004 1st Qu.: 2.425 Class :character
## Mode :character Median : 4436370 Median : 4.000 Mode :character
## Mean : 6162876 Mean : 4.066
## 3rd Qu.: 6680312 3rd Qu.: 5.550
## Max. :37253956 Max. :10.300
Calculate the mean and median population for the 50 states using the mean() and median() commands.
Remember what the mean actually is: the sum of the populations of all 50 states, divided by the number of states. We can do this manually with the code below:
# adds up total population
totalpopulation <- sum(state$Population)
# counts how many observations there are within the Population variable (50 for the 50 states)
numberofstates <- length(state$Population)
# calculate the average population across the 50 states
totalpopulation / numberofstates
## [1] 6162876
Could also look like this:
sum(state$Population)/length(state$Population)
## [1] 6162876
Or we could just use the built in functions that do these steps for us:
mean(state$Population)
## [1] 6162876
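The median works the same way; the value matches the Median row in the summary() output above:
median(state$Population) # 4436370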
The data set includes the State, the Population, and the Murder Rate (in units of murders per 100,000 people). If we wanted to calculate the average murder rate in the US, it may be tempting to calculate it the same way as we calculated the average population:
sum(state$Murder.Rate)/length(state$Murder.Rate)
## [1] 4.066
Put this number back into the context of the variable. What does it mean?
BUT remember, the Murder.Rate variable was already adjusted for population size (murders per 100,000 people per year).
We could calculate a weighted mean instead. This means we would have to consider the population size while calculating the mean murder rate. There is even a command for that: weighted.mean() from the stats package.
# stats is a base package, so no install.packages() or library() call is needed
# weighted.mean(thing you want the mean of, thing to use for weights)
weighted.mean(state$Murder.Rate, state$Population)
## [1] 4.445834
Now the average murder rate in the United States, when considering population size, is 4.4 per 100,000 people.
Create a variable for the number of murders within each state:
# Creates a new variable named "Count" within the State dataset
state <- state %>%
mutate(Count = Population/100000*Murder.Rate)
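As a sanity check (a sketch), the new Count variable can reproduce the weighted mean from above:
# total murders divided by total population, rescaled back to per 100,000
sum(state$Count) / sum(state$Population) * 100000 # should match weighted.mean(): 4.445834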
breaks <- seq(from=min(state[["Population"]]),
to=max(state[["Population"]]), length=11)
pop_freq <- cut(state[["Population"]], breaks=breaks,
right=TRUE, include.lowest = TRUE)
table(pop_freq)
Equivalently, using $ instead of [[ ]]:
breaks <- seq(from = min(state$Population), to = max(state$Population), length = 11)
pop_freq <- cut(state$Population, breaks = breaks,
right = TRUE, include.lowest = TRUE)
table(pop_freq)
## pop_freq
## [5.64e+05,4.23e+06] (4.23e+06,7.9e+06] (7.9e+06,1.16e+07] (1.16e+07,1.52e+07]
## 24 14 6 2
## (1.52e+07,1.89e+07] (1.89e+07,2.26e+07] (2.26e+07,2.62e+07] (2.62e+07,2.99e+07]
## 1 1 1 0
## (2.99e+07,3.36e+07] (3.36e+07,3.73e+07]
## 0 1
Reminder: A histogram is a way to visualize the frequency table, with bins on the x-axis and data counts on the y-axis.
hist(state$Population) # defaulted to 8 bins
hist(state$Population, breaks = breaks) # now uses the 10 bins defined above
hist(state$Murder.Rate, freq = FALSE) # freq = FALSE plots density instead of counts, so a density curve can be overlaid
lines(density(state$Murder.Rate), lwd = 3, col="blue")
Key Ideas
- A frequency histogram plots frequency counts on the y-axis and variable values on the x-axis; it gives a sense of the distribution of the data at a glance.
- A frequency table is a tabular version of the frequency counts found in a histogram.
- A boxplot, with the top and bottom of the box at the 75th and 25th percentiles respectively, also gives a quick sense of the distribution of the data; it is often used in side-by-side displays to compare distributions.
- A density plot is a smoothed version of a histogram; it requires a function to estimate a plot based on the data (multiple estimates are possible, of course).
Remember, the standard deviation is easier to interpret than the variance because it is on the same scale as the original data. Note, however, that the standard deviation is sensitive to outliers.
sd(state$Population)
## [1] 6848235
IQR(state$Population)
## [1] 4847308
# ?quantile # what are the potential options for the quantile command?
From the help page, the usage is: quantile(x, probs = seq(0, 1, 0.25), na.rm = FALSE, names = TRUE, type = 7, ...)
By default, quantile() calculates the 0% (min), 25%, 50% (median), 75%, and 100% (max) quantiles.
quantile(state$Population)
## 0% 25% 50% 75% 100%
## 563626 1833004 4436370 6680312 37253956
# change the quantiles to calculate the 5th and 95th percentiles along with the quartiles
quantile(state$Population, c(0.05, .25, .5, .75, .95))
## 5% 25% 50% 75% 95%
## 689529 1833004 4436370 6680312 19118546
boxplot() is from the graphics package. The top and bottom of the box are the 75th and 25th percentiles, and the median is the horizontal line. The dashed lines are the whiskers and indicate the range for the majority of the data. For this command, the whiskers extend to the farthest point beyond the box, up to 1.5 times the interquartile range (IQR). The IQR of the population was 4,847,308, so the whiskers extend up to 1.5 x 4,847,308 = 7,270,962 beyond the box.
boxplot(state$Population/1000000,
ylab = "Population (millions")
> Boxplots are useful summaries but hide the shape of the distribution. If there were a bimodal distribution, it would not be visible in a boxplot. Another graphing alternative to the boxplot is the violin plot, which draws the shape (the density of the points). Use geom_violin() for violin plots.
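A minimal sketch with the state data:
# violin plot: the width of the shape shows the density of points
state %>% ggplot(aes(x = "", y = Population)) +
geom_violin(fill = "lightblue")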
state %>% ggplot(aes(x=Population)) +
geom_boxplot() # boxplot
state %>% ggplot(aes(x=Population)) +
geom_density() # smoothed distribution
state %>% ggplot(aes(y=Population)) +
geom_boxplot()
p <- state %>% ggplot(aes(y=Population)) +
geom_boxplot()
p # view the basic boxplot
ggplot(state, aes(x = "", y = Population)) +
geom_boxplot() +
#coord_flip() +
geom_jitter(shape = 16, position = position_jitter(0.2)) +
stat_summary(fun = mean, geom = "point", shape = 23, size = 4) # fun replaces the deprecated fun.y
### Correlation
Correlation is the extent to which two continuous (interval or ratio level) variables are related. A relationship exists when knowing the value of one variable is useful for predicting the value of a second variable.
Correlation Coefficient: symmetric, scale-invariant measure of association between two variables
- Ranges from -1 to +1.
- Strength of Correlation: 0 means no correlation; -1 and +1 imply perfect correlation (either one increases as the other decreases [-1], or they move in the same direction [+1])
cor(variable1, variable2, use = "complete.obs")
cor.test(v1, v2)
cor.test(v1, v2, method = "spearman") # Spearman's rho; use method = "kendall" for Kendall's tau (both nonparametric)
corcoef <- cor(state$Population, state$Murder.Rate)
corcoef
## [1] 0.1820693
coef_text <- paste("Correlation Coefficient:", round(cor(state$Population, state$Murder.Rate), digits = 3))
coef_text
## [1] "Correlation Coefficient: 0.182"
Interpretation?
Link with tons of examples of ways to customize scatter plots.
ggplot(state, aes(Population, Murder.Rate, label= Abbreviation)) +
geom_point() +
geom_smooth(method=lm, se=FALSE) +
annotate(x = 20000000, y = 8.5, geom = "text", label = coef_text) +
theme(panel.background = element_blank())
## `geom_smooth()` using formula 'y ~ x'
In R, functions accept objects as inputs, manipulate the inputs in some way, and return some output. For example, the function mean(object) would return the mean of an object (assuming the object was a list of numbers). The function c() is called the Combine Function and will combine a list of numbers (or words) into a new object.
## 'rain' contains actual rainfall data for Boulder, CO (2000-2011)
rain <- c(16, 18, 14, 22, 27, 17, 19, 17, 17, 22, 20, 22)
Now that the object “rain” contains data, we can calculate some descriptive statistics:
mean(rain) #returns the average rainfall from 2000-2011 in Boulder, CO
## [1] 19.25
sum(rain) #returns the total amount of rainfall during the study period
## [1] 231
length(rain) #returns the length of the list, i.e. the number of years of data
## [1] 12
We can also calculate deviations from the mean for each year:
rain - mean(rain) #Deviations from the mean; negative values indicate below average rainfall.
## [1] -3.25 -1.25 -5.25 2.75 7.75 -2.25 -0.25 -2.25 -2.25 2.75 0.75 2.75
We can use the assignment operator to save these deviations from the mean as a new object:
rainDeviations <- rain - mean(rain)
rainDeviations^2 #Squared deviations from the mean
## [1] 10.5625 1.5625 27.5625 7.5625 60.0625 5.0625 0.0625 5.0625 5.0625
## [10] 7.5625 0.5625 7.5625
sqrt(rain) #Square root of rainfall values
## [1] 4.000000 4.242641 3.741657 4.690416 5.196152 4.123106 4.358899 4.123106
## [9] 4.123106 4.690416 4.472136 4.690416
Conceptually, the standard deviation is like the average deviation from the mean. However, the average deviation from the mean is always zero. Thus, we calculate the standard deviation as:
\[s = \sqrt{s^{2}} = \sqrt{\frac{SS}{N - 1}} = \sqrt{\frac{\sum (x_{i} - \bar{x})^{2}}{N - 1}}\] where \(s^2\) is the variance; SS is the sum of squared errors; N is the number of observations; \(x_i\) is the \(i^{th}\) score in a group; and \(\bar{x}\) is the mean of the group.
The standard deviation is the Root Mean Square (RMS) of the deviations from the mean. The above formula can be broken down into a series of simple steps:
1. Calculate the deviations from the mean (see above R code).
2. Square the deviations from the mean, save the squared deviations as a new R object (use the “<-” assignment operator).
3. Take the mean of these squared deviations, using N - 1 in the denominator as in the formula above. Again, save the result as an object.
4. Finally, take the square root of the result from the prior step.
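A sketch of these four steps, using the rain data from above:
rainDeviations <- rain - mean(rain) # Step 1: deviations from the mean
sum(rainDeviations) # essentially zero, as noted above
sqDeviations <- rainDeviations^2 # Step 2: squared deviations
meanSqDeviation <- sum(sqDeviations) / (length(rain) - 1) # Step 3: "mean" with N - 1 in the denominator
sqrt(meanSqDeviation) # Step 4: the standard deviation
sd(rain) # the built-in function gives the same result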
# Factor
# install.packages("datasets") # datasets ships with base R, so this is usually unnecessary
library(datasets)
data("mtcars")
mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
str(mtcars) # see the structure of the data
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
tab <- table(mtcars$cyl)
tab
##
## 4 6 8
## 11 7 14
prop.table(tab)
##
## 4 6 8
## 0.34375 0.21875 0.43750
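Note that str() above showed cyl as numeric, not a factor; a sketch of an explicit conversion (cyl.f is a hypothetical name):
mtcars$cyl.f <- factor(mtcars$cyl) # convert the numeric cyl to a true factor
levels(mtcars$cyl.f) # "4" "6" "8"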
summary(mtcars[,c("cyl" , "vs" , "am" , "gear" , "carb")])
## cyl vs am gear
## Min. :4.000 Min. :0.0000 Min. :0.0000 Min. :3.000
## 1st Qu.:4.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:3.000
## Median :6.000 Median :0.0000 Median :0.0000 Median :4.000
## Mean :6.188 Mean :0.4375 Mean :0.4062 Mean :3.688
## 3rd Qu.:8.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:4.000
## Max. :8.000 Max. :1.0000 Max. :1.0000 Max. :5.000
## carb
## Min. :1.000
## 1st Qu.:2.000
## Median :2.000
## Mean :2.812
## 3rd Qu.:4.000
## Max. :8.000
# Numeric
mean(mtcars$mpg)
## [1] 20.09062
median(mtcars$mpg)
## [1] 19.2
min(mtcars$mpg)
## [1] 10.4
max(mtcars$mpg)
## [1] 33.9
range(mtcars$mpg)
## [1] 10.4 33.9
quantile(mtcars$mpg)
## 0% 25% 50% 75% 100%
## 10.400 15.425 19.200 22.800 33.900
summary(mtcars$mpg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.40 15.43 19.20 20.09 22.80 33.90
# Two Numeric
data(mtcars)
# View(mtcars) # opens the spreadsheet-style data viewer in RStudio
cor(mtcars$mpg , mtcars$hp)
## [1] -0.7761684
cor(mtcars$disp , mtcars$hp)
## [1] 0.7909486
# One Factor & One Numeric
mtcars %>% group_by(cyl) %>% summarise(avg=mean(mpg),
median=median(mpg),
std=sd(mpg))
## # A tibble: 3 x 4
## cyl avg median std
## <dbl> <dbl> <dbl> <dbl>
## 1 4 26.7 26 4.51
## 2 6 19.7 19.7 1.45
## 3 8 15.1 15.2 2.56
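A sketch of visualizing this factor-by-numeric relationship with side-by-side boxplots:
# boxplots of mpg by number of cylinders
mtcars %>% ggplot(aes(x = factor(cyl), y = mpg)) +
geom_boxplot()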