Week 11: Central Tendency and Descriptive Statistics in R

Readings
- Wickham Chapter 7 - Exploratory Analysis
- Wickham Chapter 3 - Data Visualization
- Wickham Chapter 27 - R Markdown
- Wickham Chapter 28 - Graphics for Communication

Looking at Data

  • str() - Displays a preview of the internal structure of an object: the first few observations, the variable names, and the variable types
  • class() - What kind of object is it? (high-level)
  • typeof() - What is the object’s data type (low-level)
  • length() - How long is it?
  • attributes() - Does it have any metadata?

Housing Distributions

# Family sizes (kids per family) observed in both cities
Numb.Kids <- c(1, 2, 3, 4, 5, 6, 7)

# Number of families at each family size, one vector per city
Numb.Fam1 <- c(100, 200, 300, 200, 100, 50, 50)
Numb.Fam2 <- c(500, 200, 10, 10, 10, 300, 100)

# Kids contributed by each family size = families x kids per family
Tot.Kids1 <- Numb.Fam1 * Numb.Kids
Tot.Kids2 <- Numb.Fam2 * Numb.Kids

# Assemble and print the City 1 table
City1 <- data.frame(
  Numb.Kids = Numb.Kids,
  Numb.Fam1 = Numb.Fam1,
  Tot.Kids1 = Tot.Kids1
)
City1
##   Numb.Kids Numb.Fam1 Tot.Kids1
## 1         1       100       100
## 2         2       200       400
## 3         3       300       900
## 4         4       200       800
## 5         5       100       500
## 6         6        50       300
## 7         7        50       350
# City 2 table: same family sizes, with City 2's family counts and kid totals
City2 <- data.frame(Numb.Kids, Numb.Fam2, Tot.Kids2)
City2
##   Numb.Kids Numb.Fam2 Tot.Kids2
## 1         1       500       500
## 2         2       200       400
## 3         3        10        30
## 4         4        10        40
## 5         5        10        50
## 6         6       300      1800
## 7         7       100       700
# Both cities side by side in a single data frame
Cities <- data.frame(Numb.Kids, Numb.Fam1, Numb.Fam2, Tot.Kids1, Tot.Kids2)
Cities
##   Numb.Kids Numb.Fam1 Numb.Fam2 Tot.Kids1 Tot.Kids2
## 1         1       100       500       100       500
## 2         2       200       200       400       400
## 3         3       300        10       900        30
## 4         4       200        10       800        40
## 5         5       100        10       500        50
## 6         6        50       300       300      1800
## 7         7        50       100       350       700

Note: grid.arrange() is from the gridExtra package.

# Bar chart of the number of families at each family size in City 1.
# The y-axis is fixed at 0-500 so both city panels share the same scale.
City1Plot <- City1 %>%
  ggplot(aes(x = Numb.Kids, y = Numb.Fam1)) +
  geom_col() +
  labs(subtitle = "City 1") +
  ylim(0, 500) +
  theme(axis.title = element_blank())

# Same chart for City 2, using the same y-axis limits as City 1 so bar
# heights can be compared directly across the two panels.
City2Plot <- City2 %>%
  ggplot(aes(x = Numb.Kids, y = Numb.Fam2)) +
  geom_col() +
  labs(subtitle = "City 2") +
  ylim(0, 500) +
  theme(axis.title = element_blank())

# Place the panels side by side. Axis titles are blanked on each panel above
# and supplied once here for the combined figure.
grid.arrange(
  City1Plot, City2Plot,
  nrow = 1,
  top = "Distribution of Family Size in Two Cities",
  bottom = "Number of Kids",
  left = "Number of Families"
)

# Summary statistics for every column of Cities
library(psych) 
#describe(City1)
#describe(City2)
describe(Cities)   # describe() comes from the psych package
##           vars n   mean     sd median trimmed    mad min  max range skew
## Numb.Kids    1 7   4.00   2.16      4    4.00   2.97   1    7     6 0.00
## Numb.Fam1    2 7 142.86  93.22    100  142.86  74.13  50  300   250 0.44
## Numb.Fam2    3 7 161.43 186.14    100  161.43 133.43  10  500   490 0.68
## Tot.Kids1    4 7 478.57 282.63    400  478.57 148.26 100  900   800 0.29
## Tot.Kids2    5 7 502.86 629.15    400  502.86 518.91  30 1800  1770 1.08
##           kurtosis     se
## Numb.Kids    -1.71   0.82
## Numb.Fam1    -1.51  35.23
## Numb.Fam2    -1.20  70.35
## Tot.Kids1    -1.57 106.82
## Tot.Kids2    -0.29 237.79
#describeBy()      # Summarizes by Grouping Variable

Both have roughly the same number of families (1000 vs. 1130) and roughly the same number of kids (3350 vs. 3520).

They also both have similar average kids per family:

  • City 1: average number of kids per family = 3.35
  • City 2: average number of kids per family = 3.12

If the distribution of the data is spread out, what does that imply?

Why would this matter?

# Horizontal boxplots of total kids per family-size category, one per city,
# drawn on a common 0-2000 x-axis
Box1 <- ggplot(data = Cities, mapping = aes(x = Tot.Kids1)) +
  geom_boxplot(fill = "blue") +
  xlim(0, 2000) +
  labs(title = "City 1")
Box2 <- ggplot(data = Cities, mapping = aes(x = Tot.Kids2)) +
  geom_boxplot(fill = "purple") +
  xlim(0, 2000) +
  labs(title = "City 2")

grid.arrange(Box1, Box2, nrow = 1)

# Horizontal boxplots of the number of families per family-size category,
# drawn on a common 0-600 x-axis
Box3 <- ggplot(data = Cities, mapping = aes(x = Numb.Fam1)) +
  geom_boxplot(fill = "blue") +
  xlim(0, 600) +
  labs(title = "City 1")
Box3

Box4 <- ggplot(data = Cities, mapping = aes(x = Numb.Fam2)) +
  geom_boxplot(fill = "purple") +
  xlim(0, 600) +
  labs(title = "City 2")

grid.arrange(Box3, Box4, nrow = 1)

# Bar charts on the top row, total-kids boxplots on the bottom row
grid.arrange(City1Plot, City2Plot, Box1, Box2, nrow = 2, ncol = 2)

Scatterplot with Marginal Density Distribution

# Simulate two groups of 500 points each; the seed makes the draws reproducible
set.seed(1234)
x <- c(rnorm(n = 500, mean = -1), rnorm(n = 500, mean = 1.5))
y <- c(rnorm(n = 500, mean = 1), rnorm(n = 500, mean = 1.7))
# First 500 rows belong to group 1, the remaining 500 to group 2
group <- as.factor(rep(c(1, 2), each = 500))
df <- data.frame(x = x, y = y, group = group)
head(df, n = 6)
##             x          y group
## 1 -2.20706575 -0.2053334     1
## 2 -0.72257076  1.3014667     1
## 3  0.08444118 -0.5391452     1
## 4 -3.34569770  1.6353707     1
## 5 -0.57087531  1.7029518     1
## 6 -0.49394411 -0.9058829     1
# Scatter plot of x against y, points coloured by group,
# with the legend anchored in the top-left corner of the panel
scatterPlot <- ggplot(data = df, mapping = aes(x = x, y = y, color = group)) +
  geom_point() +
  scale_color_manual(values = c("#999999", "#E69F00")) +
  theme(
    legend.position = c(0, 1),
    legend.justification = c(0, 1)
  )

scatterPlot

# Marginal density of x, one semi-transparent curve per group (top panel)
xdensity <- ggplot(data = df, mapping = aes(x = x, fill = group)) +
  geom_density(alpha = .5) +
  scale_fill_manual(values = c("#999999", "#E69F00")) +
  theme(legend.position = "none")
xdensity

# Marginal density of y, one semi-transparent curve per group (right panel)
ydensity <- ggplot(data = df, mapping = aes(x = y, fill = group)) +
  geom_density(alpha = .5) +
  scale_fill_manual(values = c("#999999", "#E69F00")) +
  theme(legend.position = "none")
ydensity

# Empty placeholder panel: geom_blank() draws nothing, and the theme removes
# the backgrounds, grid lines, border, axis titles, axis text and ticks.
blankPlot <- ggplot()+geom_blank(aes(1,1))+
  theme(plot.background = element_blank(), 
   panel.grid.major = element_blank(),
   panel.grid.minor = element_blank(), 
   panel.border = element_blank(),
   panel.background = element_blank(),
   axis.title.x = element_blank(),
   axis.title.y = element_blank(),
   axis.text.x = element_blank(), 
   axis.text.y = element_blank(),
   axis.ticks = element_blank()
     )

# install.packages("gridExtra")
library("gridExtra")
# Arrange the four panels in a 2 x 2 grid with a wide left column and a tall
# bottom row, so the scatter plot gets the large cell.
# NOTE(review): assumes grid.arrange() fills the grid row by row (x density
# top-left, blank top-right, scatter bottom-left, y density bottom-right) - confirm.
grid.arrange(xdensity, blankPlot, scatterPlot, ydensity, 
        ncol=2, nrow=2, widths=c(4, 1.4), heights=c(1.4, 4))

# Load ggExtra (used here for ggMarginal())
# install.packages("ggExtra")
library("ggExtra")
## Warning: package 'ggExtra' was built under R version 4.0.5
# Create some data (same seed and draws as the earlier example, but without a group column)
set.seed(1234)
x <- c(rnorm(500, mean = -1), rnorm(500, mean = 1.5))
y <- c(rnorm(500, mean = 1), rnorm(500, mean = 1.7))
df3 <- data.frame(x, y)
# Scatter plot of x and y variables (df3 has no grouping variable, so no colouring by group)
sp2 <- ggplot(df3,aes(x, y)) + geom_point()
# Marginal density plot
ggMarginal(sp2 + theme_gray())

# Marginal histogram plot
ggMarginal(sp2 + theme_gray(), type = "histogram",
           fill = "steelblue", col = "darkblue")

# Create data: two normal samples (means 0 and 9, sd 2) combined into one
# vector, which gives the variable a two-peaked shape
my_variable <- c(rnorm(1000, 0, 2), rnorm(1000, 9, 2))

# Layout to split the screen: one column with two stacked panels, the top one
# (boxplot) 1/9 of the height and the bottom one (histogram) 8/9
layout(mat = matrix(c(1, 2), 2, 1, byrow = TRUE), heights = c(1, 8))

# Draw the boxplot and the histogram on the same value range (-10 to 20)
# so the two panels line up
par(mar = c(0, 3.1, 1.1, 2.1))  # no bottom margin: boxplot sits on the histogram
boxplot(my_variable, horizontal = TRUE, ylim = c(-10, 20), xaxt = "n",
        col = rgb(0.8, 0.8, 0, 0.5), frame = FALSE)
par(mar = c(4, 3.1, 1.1, 2.1))
hist(my_variable, breaks = 40, col = rgb(0.2, 0.8, 0.5, 0.5), border = FALSE,
     main = "", xlab = "value of the variable", xlim = c(-10, 20))

Graphing Bivariate Data

Bivariate data is used to find out if there is a relationship between two different variables. It is frequently represented with scatter plots where one variable is on the X axis and the other is on the Y axis. If the data seems to fit a line or curve then there may be a relationship, or correlation, between the two variables. Always be careful when examining relationships. Many variables may appear related when in fact their relationship happened by chance or a third variable is influencing both variables.

Scatterplots

Test of Association: Used to test whether two variables are related to one another or not
- Depends on the nature of the variables you are studying (nominal, ordinal, interval) and number of categories
- Depends on the nature of the question you’re answering (independence, agreement of coders, effect of an intervention, etc.)

Questions to ask yourself:

  • Could this pattern happen by chance?
  • How do you describe the relationship implied by the pattern?
  • How strong is the relationship implied by the pattern?
  • What other variables might affect the relationship?
  • Does the relationship change if you look at individual subgroups of the data?

“Variation creates uncertainty but covariation reduces it.” - pg 106

Bruce & Bruce Textbook Example

Content warning: Murder

This recreates and expands on the example in the book Practical Statistics for Data Scientists: 50 Essential Concepts by Peter Bruce & Andrew Bruce (one of the assigned readings).

getwd() # check my working directory to make sure that R can find my file without the full file path
## [1] "C:/Users/aleaw/OneDrive - University of Illinois at Chicago/Week 11"
state <- read_csv("Bruce&Bruce_murder.csv") # bring the CSV into R as a tibble
## 
## -- Column specification --------------------------------------------------------
## cols(
##   State = col_character(),
##   Population = col_double(),
##   Murder.Rate = col_double(),
##   Abbreviation = col_character()
## )
names(state)      # tells me names of variables
## [1] "State"        "Population"   "Murder.Rate"  "Abbreviation"
str(state)        # shows preview of dataset structure
## spec_tbl_df [50 x 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ State       : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Population  : num [1:50] 4779736 710231 6392017 2915918 37253956 ...
##  $ Murder.Rate : num [1:50] 5.7 5.6 4.7 5.6 4.4 2.8 2.4 5.8 5.8 5.7 ...
##  $ Abbreviation: chr [1:50] "AL" "AK" "AZ" "AR" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   State = col_character(),
##   ..   Population = col_double(),
##   ..   Murder.Rate = col_double(),
##   ..   Abbreviation = col_character()
##   .. )
summary(state)    # calculates statistics for all numeric variables
##     State             Population        Murder.Rate     Abbreviation      
##  Length:50          Min.   :  563626   Min.   : 0.900   Length:50         
##  Class :character   1st Qu.: 1833004   1st Qu.: 2.425   Class :character  
##  Mode  :character   Median : 4436370   Median : 4.000   Mode  :character  
##                     Mean   : 6162876   Mean   : 4.066                     
##                     3rd Qu.: 6680312   3rd Qu.: 5.550                     
##                     Max.   :37253956   Max.   :10.300

Calculate the mean and median population for the 50 states using the mean() and median() commands.

Remember what the mean actually is: the sum of the populations of all 50 states, divided by the number of states. We can do this manually with the code below:

# adds up total population
totalpopulation <- sum(state$Population)

# counts how many observations there are within the Population variable (50 for the 50 states)
numberofstates <- length(state$Population) 

# calculate the average population across the 50 states
totalpopulation / numberofstates
## [1] 6162876

Could also look like this:

sum(state$Population)/length(state$Population)
## [1] 6162876

Or we could just use the built in functions that do these steps for us:

mean(state$Population)
## [1] 6162876

Calculating Murder Rates

The data set includes the State, the Population, and the Murder Rate (in units of murders per 100,000 people). If we wanted to calculate the average murder rate within the US, it may be tempting to calculate it the same way as we calculated the average population:

sum(state$Murder.Rate)/length(state$Murder.Rate)
## [1] 4.066

Put this number back into the context of the variable. What does it mean?

BUT remember, the Murder.Rate variable was already adjusted for population size (murders per 100,000 people per year).

We could calculate a weighted mean. This means we would have to consider the population size while calculating the mean murder rate. There is even a command for that: weighted.mean() from the stats package.

# The stats package ships with R and is attached by default, so neither
# install.packages() nor library() is needed before calling weighted.mean().
# install.packages("stats")
# library(stats)

# weighted.mean(thing you want the mean of, thing to use for weights)
weighted.mean(state$Murder.Rate, state$Population)
## [1] 4.445834

Now the average murder rate in the United States, when considering population size, is 4.4 per 100,000 people.

Create a variable for the number of murders within each state:

# Add a "Count" column to the state dataset: Murder.Rate is expressed per
# 100,000 people, so the population is scaled to units of 100,000 first.
state <- mutate(state, Count = Population / 100000 * Murder.Rate)

Frequency Table and Histograms

# 11 equally spaced break points from the smallest to the largest state
# population define 10 equal-width bins.
breaks <- seq(
  from = min(state$Population),
  to = max(state$Population),
  length.out = 11
)

# Assign each state to its bin. include.lowest = TRUE keeps the state with the
# minimum population, which would otherwise fall outside the first
# right-closed interval.
pop_freq <- cut(state$Population, breaks = breaks,
                right = TRUE, include.lowest = TRUE)

# Frequency table: number of states per population bin
table(pop_freq)
## pop_freq
## [5.64e+05,4.23e+06]  (4.23e+06,7.9e+06]  (7.9e+06,1.16e+07] (1.16e+07,1.52e+07] 
##                  24                  14                   6                   2 
## (1.52e+07,1.89e+07] (1.89e+07,2.26e+07] (2.26e+07,2.62e+07] (2.62e+07,2.99e+07] 
##                   1                   1                   1                   0 
## (2.99e+07,3.36e+07] (3.36e+07,3.73e+07] 
##                   0                   1

Reminder: A histogram is a way to visualize the frequency table, with bins on the x-axis and data count on the y-axis.

  • Empty bins should be included in the graph! Otherwise you skew the axis
  • Bins are equal width
  • Number of bins / bin size is up to the user
  • Bars are contiguous - no empty space between bars! (unless there is an empty bin)
hist(state$Population) # defaulted to 8 bins

hist(state$Population, breaks = breaks) # now uses the 10 equal-width bins defined by the 11 values in `breaks`

hist(state$Murder.Rate, freq = FALSE) # freq = FALSE puts density, not counts, on the y-axis
lines(density(state$Murder.Rate), lwd = 3, col="blue") # overlay the estimated density curve on the histogram

Key Ideas

  • A frequency histogram plots frequency counts on the y-axis and variable values on the x-axis; it gives a sense of the distribution of the data at a glance.

  • A frequency table is a tabular version of the frequency counts found in a histogram.

  • A boxplot—with the top and bottom of the box at the 75th and 25th percentiles, respectively—also gives a quick sense of the distribution of the data; it is often used in side-by-side displays to compare distributions.

  • A density plot is a smoothed version of a histogram; it requires a function to estimate a plot based on the data (multiple estimates are possible, of course).

Standard Deviation

Remember, standard deviation is easier to interpret than variance because it is on the same scale as the original data. Standard deviation is sensitive to outliers.

sd(state$Population)
## [1] 6848235
IQR(state$Population)
## [1] 4847308
#    ?quantile # what are the potential options for the quantile command?

quantile(____): quantile(x, probs = seq(0, 1, 0.25), na.rm = FALSE, names = TRUE, type = 7, ...)

The default quantiles are to calculate the 0% (min), 25%, 50% (median), 75%, and 100% (max)

quantile(state$Population)
##       0%      25%      50%      75%     100% 
##   563626  1833004  4436370  6680312 37253956
# change the quantiles to add the 5th and 95th percentiles in place of the
# minimum (0%) and maximum (100%)
quantile(state$Population, c(0.05, 0.25, 0.5, 0.75, 0.95))
##       5%      25%      50%      75%      95% 
##   689529  1833004  4436370  6680312 19118546

boxplot() is from the graphics package. The top and bottom of the box are the 75th and 25th percentiles. The median is the horizontal line. The dashed lines are the whiskers and indicate the range for the majority of the data. For this command, each whisker extends to the farthest data point that lies within 1.5 times the interquartile range (IQR) of the edge of the box. The IQR of the population was \(4.847308 \times 10^{6}\), so the whiskers can reach at most \(1.5 \times (4.847308 \times 10^{6}) = 7.270962 \times 10^{6}\) below the 25th percentile and above the 75th percentile.

# Base-graphics boxplot of state populations, rescaled to millions so the
# axis tick labels stay readable
boxplot(state$Population / 1000000,
        ylab = "Population (millions)")

> Boxplots are useful summaries, but hide the shape of the distribution. If there was a bimodal distribution, it would not be visible with a boxplot. Another graphing alternative to the boxplot is a violin plot, where the shape (of the density of points) is drawn. Use geom_violin() for violin plots.

# Horizontal boxplot of state populations
ggplot(state, aes(x = Population)) +
  geom_boxplot()

# Smoothed distribution (density curve) of state populations
ggplot(state, aes(x = Population)) +
  geom_density()

# Vertical boxplot: Population mapped to y instead of x
ggplot(state, aes(y = Population)) +
  geom_boxplot()

# Store the same vertical boxplot in an object so it can be reused
p <- ggplot(state, aes(y = Population)) +
  geom_boxplot()

p    # view the basic boxplot

# Boxplot of state populations with the individual states (jittered points)
# and the mean (diamond) overlaid.
# x is mapped to a constant so that all states fall into one box; mapping the
# continuous Population to x as well is what triggers ggplot2's
# "Continuous x aesthetic" warning.
ggplot(state, aes(x = "", y = Population)) +
  geom_boxplot() +
  # coord_flip() +
  geom_jitter(shape = 16, position = position_jitter(width = 0.2)) +
  # `fun` replaces the deprecated `fun.y` argument
  stat_summary(fun = mean, geom = "point", shape = 23, size = 4) +
  labs(x = NULL)

### Correlation

Correlation is the extent to which two continuous (interval or ratio level) variables are related. A relationship exists when knowing the value of one variable is useful for predicting the value of a second variable.

Correlation Coefficient: symmetric, scale-invariant measure of association between two variables
- Ranges from -1 to +1.
- Strength of Correlation: 0 means no correlation, -1 and +1 imply perfect correlation (either one increases as the other decreases [-1] or they move in the same direction[+1])

Pearson Correlation

  • assumes there is a normal distribution
  • cor() can be used to compute correlation between two or more vectors
  • cor.test() tells you if the correlation is significantly different than zero
# Templates only: variable1/variable2 and v1/v2 are placeholders for your own vectors
cor(variable1, variable2, use = "complete.obs")
cor.test(v1, v2)
cor.test(v1, v2, method = "spearman") # Spearman's rho (or method = "kendall" for Kendall's tau) for nonparametric 
# Correlation between state population and murder rate
corcoef <- cor(state$Population, state$Murder.Rate)
corcoef
## [1] 0.1820693
# Text label carrying the coefficient rounded to 3 decimals
coef_text <- paste("Correlation Coefficient:", round(corcoef, digits = 3))
coef_text
## [1] "Correlation Coefficient: 0.182"

Interpretation?

Link with tons of examples of ways to customize scatter plots.

# Scatter plot of murder rate against population with a fitted straight line
# (no confidence band) and the correlation coefficient printed on the panel
ggplot(
  data = state,
  mapping = aes(x = Population, y = Murder.Rate, label = Abbreviation)
) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  annotate(geom = "text", x = 20000000, y = 8.5, label = coef_text) +
  theme(panel.background = element_blank())
## `geom_smooth()` using formula 'y ~ x'

Rain Example

In R, functions accept objects as inputs, manipulate the inputs in some way, and return some output. For example, the function mean(object) would return the mean of an object (assuming the object was a list of numbers). The function c() is called the Combine Function and will combine a list of numbers (or words) into a new object.

## 'rain' contains actual rainfall data for Boulder, CO (2000-2011)
rain <- c(16, 18, 14, 22, 27, 17, 19, 17, 17, 22, 20, 22)

The object “rain” contains data, we can calculate some descriptive statistics:

mean(rain) #returns the average rainfall from 2000-2011 in Boulder, CO
## [1] 19.25
sum(rain) #returns the total amount of rainfall during the study period
## [1] 231
length(rain) #returns the length of the list, i.e. the number of years of data
## [1] 12

We can also calculate deviations from the mean for each year:

rain - mean(rain)  #Deviations from the mean; negative values indicate below average rainfall.
##  [1] -3.25 -1.25 -5.25  2.75  7.75 -2.25 -0.25 -2.25 -2.25  2.75  0.75  2.75

We can use the assignment operator to save these deviations from the mean as a new object:

rainDeviations <- rain - mean(rain)
rainDeviations^2  #Squared deviations from the mean
##  [1] 10.5625  1.5625 27.5625  7.5625 60.0625  5.0625  0.0625  5.0625  5.0625
## [10]  7.5625  0.5625  7.5625
sqrt(rain)  #Square root of rainfall values
##  [1] 4.000000 4.242641 3.741657 4.690416 5.196152 4.123106 4.358899 4.123106
##  [9] 4.123106 4.690416 4.472136 4.690416

Conceptually, the standard deviation is like the average deviation from the mean. However, the average deviation from the mean is always zero. Thus, we calculate the standard deviation as:

\[s = \sqrt{s^{2}} = \sqrt{\frac{SS}{N - 1}} = \sqrt{\frac{\sum (x_{i} - \bar{x})^{2}}{N - 1}}\] where \(s^2\) is the variance; SS is the sum of squared deviations from the mean; N is the number of observations; \(x_i\) is the \(i^{th}\) score in a group; and \(\bar{x}\) is the mean of the group.

The standard deviation is approximately the Root Mean Square (RMS) of the deviations from the mean; the only difference is that the sum of squared deviations is divided by N − 1 instead of N. The above formula can be broken down into a series of simple steps:
1. Calculate the deviations from the mean (see above R code).
2. Square the deviations from the mean, save the squared deviations as a new R object (use the “<-” assignment operator).
3. Add up these squared deviations and divide the sum by N − 1 (not N), as in the formula above. Again, save the result as an object.
4. Finally, take the square root of the result from the prior step.

mtcars Examples

Univariate Analysis

# Factor
# No install.packages() call is needed: the datasets package ships with R
library(datasets)
data("mtcars")
mtcars
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
str(mtcars) # see the structure of the data
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
tab <- table(mtcars$cyl)
tab
## 
##  4  6  8 
## 11  7 14
prop.table(tab)
## 
##       4       6       8 
## 0.34375 0.21875 0.43750
summary(mtcars[,c("cyl" , "vs" , "am" , "gear" , "carb")])
##       cyl              vs               am              gear      
##  Min.   :4.000   Min.   :0.0000   Min.   :0.0000   Min.   :3.000  
##  1st Qu.:4.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:3.000  
##  Median :6.000   Median :0.0000   Median :0.0000   Median :4.000  
##  Mean   :6.188   Mean   :0.4375   Mean   :0.4062   Mean   :3.688  
##  3rd Qu.:8.000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:4.000  
##  Max.   :8.000   Max.   :1.0000   Max.   :1.0000   Max.   :5.000  
##       carb      
##  Min.   :1.000  
##  1st Qu.:2.000  
##  Median :2.000  
##  Mean   :2.812  
##  3rd Qu.:4.000  
##  Max.   :8.000
# Numeric
mean(mtcars$mpg)
## [1] 20.09062
median(mtcars$mpg)
## [1] 19.2
min(mtcars$mpg)
## [1] 10.4
max(mtcars$mpg)
## [1] 33.9
range(mtcars$mpg)
## [1] 10.4 33.9
quantile(mtcars$mpg)
##     0%    25%    50%    75%   100% 
## 10.400 15.425 19.200 22.800 33.900
summary(mtcars$mpg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.40   15.43   19.20   20.09   22.80   33.90

Bivariate Analysis

# Two Numeric
data(mtcars)
View(mtcars)

cor(mtcars$mpg , mtcars$hp)
## [1] -0.7761684
cor(mtcars$disp , mtcars$hp)
## [1] 0.7909486
# One Factor & One Numeric
# Mean, median and standard deviation of mpg within each cylinder count
mtcars %>%
  group_by(cyl) %>%
  summarise(
    avg = mean(mpg),
    median = median(mpg),
    std = sd(mpg)
  )
## # A tibble: 3 x 4
##     cyl   avg median   std
##   <dbl> <dbl>  <dbl> <dbl>
## 1     4  26.7   26    4.51
## 2     6  19.7   19.7  1.45
## 3     8  15.1   15.2  2.56