Chapter 1 : Descriptive Statistics

Thirty students in the School of Business were asked what their majors were. The following represents their responses (M = Management; A = Accounting; E = Economics; O = Others).

A M M A M M E M O A E E M A O E M A M A M A O A M E E M A M

Construct a frequency distribution and a bar graph.
Construct a relative frequency distribution and a pie chart.

Ans.

# Survey answers of the thirty students, kept in the order they were collected
# (A = Accounting, E = Economics, M = Management, O = Others).
Majors <- data.frame(
  Majors = strsplit(
    "A M M A M M E M O A E E M A O E M A M A M A O A M E E M A M",
    split = " ", fixed = TRUE
  )[[1]]
)

# Number of students per major, and each count as a share of all respondents.
frequency <- table(Majors)
relativefrequency <- frequency / nrow(Majors)

# cbind() turns each one-dimensional table into a one-column matrix whose
# column carries the name of the variable passed in.
frequencydistribution <- cbind(frequency)
relativefrequencydistribution <- cbind(relativefrequency)
# Divert console output to Results.txt so both tables are saved to disk.
sink("Results.txt")
# print() is called explicitly: top-level auto-printing is skipped when the
# script is run through source(), which would leave Results.txt empty.
print(frequencydistribution)

##   frequency
## A         9
## E         6
## M        12
## O         3

print(relativefrequencydistribution)

##   relativefrequency
## A               0.3
## E               0.2
## M               0.4
## O               0.1

# Restore normal console output before continuing.
sink()
# Store the raw responses next to the summary tables.
write.csv(Majors, "Majors.csv")
# Bar chart of the counts per major using barplot() defaults.
barplot(frequency)

# The same chart with axis titles and a separate fill colour for each bar.
barplot(
  frequency,
  xlab = "Majors",
  ylab = "Frequency",
  col = c("Red", "Blue", "Green", "Yellow")
)

# Share of each major as a whole-number percentage.
pct <- round(frequency / sum(frequency) * 100)
# Slice label: the major code, a line break, then its share, e.g. "A\n 30%".
lbls <- paste0(names(frequency), "\n ", pct, "%")
pie(frequency, labels = lbls, col = rainbow(length(frequency)))

A sample of 9 mothers was taken. The mothers were asked the age of their oldest child. You are given their responses below.

3 12 4 7 14 6 2 9 11

Compute the mean.
Compute the variance.
Compute the standard deviation.
Compute the coefficient of variation.
Determine the 25th percentile.
Determine the median.
Determine the 75th percentile.
Determine the range.

Ans.

library(psych)
# Ages of the oldest child reported by the nine mothers in the sample.
data_set <- data.frame(Age = c(3, 12, 4, 7, 14, 6, 2, 9, 11))

# summary() and fivenum() below cover the mean, median and quartiles, but the
# exercise also asks for the variance, standard deviation, coefficient of
# variation and range, so those are computed explicitly.
age_mean <- mean(data_set$Age)
age_variance <- var(data_set$Age)       # sample variance (n - 1 denominator)
age_sd <- sd(data_set$Age)
age_cv <- age_sd / age_mean * 100       # coefficient of variation, in percent
age_range <- diff(range(data_set$Age))  # largest minus smallest observation
print(c(
  mean = age_mean, variance = age_variance, sd = age_sd,
  cv_percent = age_cv, range = age_range
))

# Minimum, 25th percentile, median, mean, 75th percentile and maximum.
summary(data_set$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   4.000   7.000   7.556  11.000  14.000

# Tukey's five-number summary (minimum, lower hinge, median, upper hinge, maximum).
fivenum(data_set$Age)

## [1]  2  4  7 11 14

# Descriptive-statistics table for the ages, produced by psych::describe().
describe(data_set[["Age"]])

A student has completed 20 courses in the School of Arts and Sciences. Her grades in the 20 courses are shown below.

A B A B C
C C B B B
B A B B B
C B C B A

Develop a frequency distribution and a bar graph for her grades.

Ans.

# The student's twenty course grades.
Grade <- c(
  "A", "C", "B", "C", "B",
  "C", "A", "B", "A", "B",
  "B", "C", "B", "B", "B",
  "B", "C", "B", "B", "A"
)
dataset <- data.frame(Grade)

# Count how often each grade occurs, then present the counts as a one-column
# data frame with the grades as row names.
frequency <- table(dataset)
frequencydistribution <- data.frame(cbind(frequency))
frequencydistribution

# Legend entries pair each grade with its count, e.g. "A: 4". The earlier
# labels glued grade and count together and ended in a stray line break.
lbls <- paste0(row.names(frequencydistribution), ": ", frequency)
# args.legend expects a list of named arguments for legend(); the label vector
# that used to be passed there is not a valid value, so it is dropped and the
# legend keeps its default position.
barplot(
  table(dataset),
  legend.text = lbls,
  col = rainbow(length(frequency))
)

Develop a relative frequency distribution for her grades and construct a pie chart.

Ans.

# The student's twenty course grades.
Grade <- c(
  "A", "C", "B", "C", "B",
  "C", "A", "B", "A", "B",
  "B", "C", "B", "B", "B",
  "B", "C", "B", "B", "A"
)
dataset <- data.frame(Grade)
frequency <- table(dataset)

# Relative frequency of each grade as a whole-number percentage.
pct <- round(frequency / sum(frequency) * 100)
# Slice label: the grade, a line break, then its share, e.g. "B\n 55%".
lbls <- paste0(names(frequency), "\n ", pct, "%")
pie(frequency, labels = lbls, col = rainbow(length(frequency)))

4. The ages of a sample of 8 faculty members selected from the School of Business Administration are shown below.

42 30 73 50 51 37 42 59

Compute the average age.
Determine the mode.
Compute the median age.
Compute the standard deviation.

Ans.

library(psych)
# Ages of the eight sampled faculty members, indexed by a running faculty id.
Faculty <- 1:8
Age <- c(42, 30, 73, 50, 51, 37, 42, 59)
dataset <- data.frame(Faculty, Age)

# The exercise asks for the mode, which none of the summaries below report:
# take the age(s) with the highest count (more than one value if there is a tie).
age_counts <- table(dataset$Age)
age_mode <- as.numeric(names(age_counts)[age_counts == max(age_counts)])
# Sample standard deviation (n - 1 denominator), also requested by the exercise.
age_sd <- sd(dataset$Age)
cat("Mode:", age_mode, "\n")
cat("Standard deviation:", age_sd, "\n")

# Mean (average age) and median, together with the quartiles and extremes.
summary(dataset$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   30.00   40.75   46.00   48.00   53.00   73.00

# Tukey's five-number summary (minimum, lower hinge, median, upper hinge, maximum).
fivenum(dataset$Age)

## [1] 30.0 39.5 46.0 55.0 73.0

# Descriptive-statistics table for the faculty ages, produced by psych::describe().
describe(dataset[["Age"]])

Marissa, a freshman at a local college, just completed 18 credit hours. Her grade report is presented below.

##     Course Credithours Grade
## 1 Chemisry           5     C
## 2 Calculus           5     A
## 3  English           4     C
## 4    Music           3     F
## 5       PE           1     A

The local university uses a 4-point grading system, i.e., A = 4, B = 3, C = 2, D = 1, F = 0. Compute Marissa’s semester grade point average.

# Marissa's grade report: one entry per course with its credit hours and the
# letter grade she received.
Course <- c("Chemisry", "Calculus", "English", "Music", "PE")
Credithours <- c(5, 5, 4, 3, 1)
Grade <- c("C", "A", "C", "F", "A")

# Combine the three parallel vectors into one table, one row per course.
Report <- data.frame(Course, Credithours, Grade)
# Convert letter grades to grade points on the 4-point scale
# (A = 4, B = 3, C = 2, D = 1, F = 0).
#
# grade: a letter grade given as a character or factor value; a vector of
#        several grades is accepted as well.
# Returns the matching grade points as an unnamed numeric vector of the same
# length as `grade`.
# Stops with an error for any value outside the scale (including NA) instead of
# handing the unconverted letter back to the caller.
equi <- function(grade) {
  grade_points <- c(A = 4, B = 3, C = 2, D = 1, F = 0)
  # as.character() so that a factor is looked up by its label rather than by
  # its internal integer code.
  grade_letters <- as.character(grade)
  unknown <- !(grade_letters %in% names(grade_points))
  if (any(unknown)) {
    stop(
      "Unrecognised grade(s): ",
      paste(unique(grade_letters[unknown]), collapse = ", "),
      call. = FALSE
    )
  }
  unname(grade_points[grade_letters])
}

# Translate every letter grade to grade points. vapply() yields a plain numeric
# vector and checks that each grade maps to exactly one number, so gradenumeric
# becomes an ordinary numeric column rather than the list-column lapply() made.
Report$gradenumeric <- vapply(
  as.character(Report$Grade), equi, numeric(1), USE.NAMES = FALSE
)
# Quality points earned in a course: its credit hours weighted by the grade.
#
# Credithours: credit hours of the course.
# Grade:       numeric grade points awarded for the course.
# Returns the product Credithours * Grade.
cgpa <- function(Credithours, Grade) {
  Credithours * Grade
}
# Quality points per course, obtained by applying cgpa() to each row's credit
# hours and grade points.
Report$gpa <- mapply(FUN = cgpa, Report$Credithours, Report$gradenumeric)
# Semester GPA: total quality points divided by total credit hours.
gpacalc <- with(Report, sum(gpa) / sum(Credithours))
cat(sprintf("\n Grade Point Average = %f\n", gpacalc))

## 
##  Grade Point Average = 2.333333

For the following observations, plot a scatter diagram and indicate what kind of relationship (if any) exists between x and y.

##   x  y
## 1 2  7
## 2 6 19
## 3 3  9
## 4 5 17
## 5 4 11

A positive relationship between x and y appears to exist as per the scatterplot below.

# The five (x, y) observations of the exercise.
x <- c(2, 6, 3, 5, 4)
y <- c(7, 19, 9, 17, 11)
dataset <- data.frame(x, y)

# Scatter diagram of y against x, drawn in blue.
plot(dataset, col = "blue")

# The same pair of variables shown as a scatterplot matrix.
pairs(
  ~ x + y,
  data = dataset,
  main = "Simple Scatterplot Matrix of x and y"
)

For the following observations, plot a scatter diagram and indicate what kind of relationship (if any) exists between x and y.

##   x  y
## 1 8  4
## 2 5  5
## 3 3  9
## 4 2 12
## 5 1 14

A negative relationship between x and y appears to exist as per the scatterplot below.

# The five (x, y) observations of the exercise.
x <- c(8, 5, 3, 2, 1)
y <- c(4, 5, 9, 12, 14)
dataset <- data.frame(x, y)

# Scatter diagram of y against x, drawn in red.
plot(dataset, col = "red")

# The same pair of variables shown as a scatterplot matrix.
pairs(
  ~ x + y,
  data = dataset,
  main = "Simple Scatterplot Matrix of x and y"
)

This is with reference to the dataset of iris flowers.

Visually compare the quantitative attributes of the different iris species, namely setosa, versicolor and virginica.
Visualize whether the quantitative variables are correlated with each other.
Visualize the distribution of the quantitative variables.

library(datasets)
library(corrgram)

## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang

## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus

# Load the built-in iris measurements and keep a working copy under a short name.
data(iris)
ds <- iris
# Show the first rows to check the column layout.
head(iris)

# One boxplot per measurement, split by species, so that the three species can
# be compared side by side. Axis titles are given explicitly for every plot.
boxplot(
  Sepal.Length ~ Species, data = ds,
  col = "red", xlab = "Species", ylab = "Sepal Length"
)

boxplot(
  Sepal.Width ~ Species, data = ds,
  col = "blue", xlab = "Species", ylab = "Sepal Width"
)

boxplot(
  Petal.Length ~ Species, data = ds,
  col = "green", xlab = "Species", ylab = "Petal Length"
)

boxplot(
  Petal.Width ~ Species, data = ds,
  col = "orange", xlab = "Species", ylab = "Petal Width"
)

# Print the conclusion drawn from the four boxplots.
cat("\n From the boxplots, it can be concluded that Petal Width differentiates the different species of the flower.\n")

## 
##  From the boxplots, it can be concluded that Petal Width differentiates the different species of the flower.

# Correlogram of the iris data: shaded cells in the lower triangle, pie glyphs
# in the upper triangle, with the variables reordered (order = TRUE).
corrgram(
  ds,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  main = "Corrgram"
)

# Print the conclusion drawn from the correlogram.
cat("\nFrom the corrgram , it can be concluded that  Petal Width and Petal Length are highly positively correlated.\n")

## 
## From the corrgram , it can be concluded that  Petal Width and Petal Length are highly positively correlated.

# Histogram of each quantitative variable. The data argument is written as
# ds$<column> in every call because hist() builds its default title from that
# expression.
hist(x = ds$Sepal.Length, col = "red", xlab = "Sepal Length")

hist(x = ds$Sepal.Width, col = "blue", xlab = "Sepal Width")

hist(x = ds$Petal.Length, col = "green", xlab = "Petal Length")

hist(x = ds$Petal.Width, col = "orange", xlab = "Petal Width")

The results of a recent study regarding smoking and three types of illness are shown in the following table.

## 
##    Cell Contents
## |-------------------------|
## |                   Count |
## |-------------------------|
## 
## Total Observations in Table:  300 
## 
##              | Smoking 
##      Disease | Non-Smokers  |     Smokers  |   Row Total | 
## -------------|-------------|-------------|-------------|
##    Emphysema |         20  |         60  |         80  | 
## -------------|-------------|-------------|-------------|
## HeartProblem |         70  |         80  |        150  | 
## -------------|-------------|-------------|-------------|
##       Cancer |         30  |         40  |         70  | 
## -------------|-------------|-------------|-------------|
## Column Total |        120  |        180  |        300  | 
## -------------|-------------|-------------|-------------|
## 
##  
## NULL

We are interested in visualizing whether or not illness is independent of smoking.

# Disease-by-smoking contingency table. The counts are listed column by column:
# first the three non-smoker counts, then the three smoker counts.
datamatrix <- matrix(
  c(20, 70, 30, 60, 80, 40),
  nrow = 3, ncol = 2,
  dimnames = list(
    c("Emphysema", "HeartProblem", "Cancer"),
    c("Non-Smokers", "Smokers")
  )
)
# Each cell as a share of all 300 observations.
prop.table(datamatrix)

##              Non-Smokers   Smokers
## Emphysema     0.06666667 0.2000000
## HeartProblem  0.23333333 0.2666667
## Cancer        0.10000000 0.1333333

# Association plot; the first colour marks positive and the second negative
# deviations from independence.
assocplot(datamatrix, col = c("green", "red"))

From the association plot, it can be concluded that illness is not independent of smoking. Green indicates a positive association between the corresponding categories, whereas red indicates a negative association.

There are 800 students in the School of Business Administration. There are four majors in the School: Accounting, Finance, Management, and Marketing. The following shows the number of students in each major.

Accounting 240
Finance 160
Management 320
Marketing 80

Develop a percent frequency distribution.

Ans.

# Enrolment per major in the School of Business Administration.
Major <- c("Accounting", "Finance", "Management", "Marketing")
Number_of_Students <- c(240, 160, 320, 80)
dataset <- data.frame(Major, Number_of_Students)

# Percent frequency: each major's enrolment as a percentage of the total.
dataset$Percentage_Students <- with(
  dataset,
  Number_of_Students / sum(Number_of_Students) * 100
)
dataset