R Exercise

Part 1: Temperature Analysis (15 total points)

1.1 Create initial data frame

# Seven consecutive days, parsed in one vectorised call from month/day/year
# strings.
date <- as.Date(
  c("9/24/2016", "9/25/2016", "9/26/2016", "9/27/2016",
    "9/28/2016", "9/29/2016", "9/30/2016"),
  format = "%m/%d/%Y"
)

# Daily temperature readings for each city, in the same order as `date`.
nash_temp <- c(92, 94, 78, 79, 84, 67, 68)
tor_temp <- c(18, 16, 19, 21, 21, 19, 18)

# One row per day: date, Nashville reading, Toronto reading.
temp.diff.df <- data.frame(date, nash_temp, tor_temp)

# View(temp.diff.df)

1.2 Add to existing data frame

# Add three derived columns, each building on the previous one:
#   tor_temp_f    - Toronto reading converted with the Celsius-to-Fahrenheit
#                   formula (x * 1.8 + 32)
#   temp_diff     - Nashville reading minus the converted Toronto reading
#   pct_temp_diff - that difference as a fraction of the converted Toronto value
temp.diff.df <- temp.diff.df %>%
  mutate(tor_temp_f = tor_temp * 1.8 + 32) %>%
  mutate(temp_diff = nash_temp - tor_temp_f) %>%
  mutate(pct_temp_diff = temp_diff / tor_temp_f)

# View(temp.diff.df)


# Do not modify the following code:
knitr::kable(temp.diff.df, format = "markdown")

date	nash_temp	tor_temp	tor_temp_f	temp_diff	pct_temp_diff
2016-09-24	92	18	64.4	27.6	0.4285714
2016-09-25	94	16	60.8	33.2	0.5460526
2016-09-26	78	19	66.2	11.8	0.1782477
2016-09-27	79	21	69.8	9.2	0.1318052
2016-09-28	84	21	69.8	14.2	0.2034384
2016-09-29	67	19	66.2	0.8	0.0120846
2016-09-30	68	18	64.4	3.6	0.0559006

1.3 Calculate mean percentage difference

# Mean of the daily fractional differences, converted to a percentage and
# rounded to four decimal places. Printed so the value appears in the report.
avg <- round(mean(temp.diff.df$pct_temp_diff) * 100, digits = 4)
avg

## [1] 22.23

The mean percentage temperature difference was 22.23%.

1.4 Plot the percent difference per day

# Bar chart of the daily Nashville-vs-Toronto difference, one bar per date.
# pct_temp_diff is stored as a fraction (temp_diff / tor_temp_f), so it is
# scaled by 100 here to make the values agree with the "Percent" axis label.
ggplot(temp.diff.df, aes(x = date, y = pct_temp_diff * 100)) +
  geom_bar(stat = "identity", color = "black", fill = "gray") +
  labs(x = "date", y = "Percent Difference",
       title = "Nashville vs Toronto Temperature")

# Largest daily fractional difference in the week.
maxDiff <- max(temp.diff.df$pct_temp_diff)

# Row(s) whose difference equals that maximum, keeping only the date column.
# The result stays a one-column data frame; ties would yield several rows.
highestDate <- subset(temp.diff.df, pct_temp_diff == maxDiff, select = date)

The percent difference was greatest on 2016-09-25.

Part 2: Analysis of Fish Data (35 total points)

2.1 Import the data

# Download the fish-catch file. It has no header row, and the empty `sep`
# makes any run of whitespace act as the field delimiter.
fish <- read.csv(
  file = "https://ww2.amstat.org/publications/jse/datasets/fishcatch.dat.txt",
  header = FALSE,
  sep = ""
)

# Name the nine columns in file order.
names(fish) <- c(
  "obs", "species", "weight",
  "len1", "len2", "len3",
  "height.pct", "width.pct", "sex"
)
# View(fish)

2.2 Change ‘sex’ to be a factor

# Recode `sex` as a factor. factor() applies the labels to the sorted distinct
# values in the column, so "female" goes to the lower code and "male" to the
# higher one; missing values stay NA.
fish[["sex"]] <- factor(fish[["sex"]], labels = c("female", "male"))

2.3 Change ‘species’ to be a factor and add labels

# Recode `species` as a labelled factor. Labels are matched, in order, to the
# sorted distinct codes found in the column (first label -> lowest code).
fish[["species"]] <- factor(
  fish[["species"]],
  labels = c("Common Bream", "Whitefish", "Roach", "Silver Bream",
             "Smelt", "Pike", "Perch")
)


# Do not modify the following code:
fish.sub <- filter(fish, sex != "NA")
knitr::kable(head(fish.sub), format = "markdown")

obs	species	weight	len1	len2	len3	height.pct	width.pct	sex
14	Common Bream	NA	29.5	32	37.3	37.3	13.6	male
15	Common Bream	600	29.4	32	37.2	40.2	13.9	male
17	Common Bream	700	30.4	33	38.3	38.8	13.8	male
21	Common Bream	575	31.3	34	39.5	38.3	14.1	male
26	Common Bream	725	31.8	35	40.9	40.0	14.8	male
30	Common Bream	1000	33.5	37	42.6	44.5	15.5	female

2.4 Determine mean weight for each species

# Mean weight per species; missing weights are left out of each mean.
by_species <- fish %>%
  group_by(species)

mean.wt <- by_species %>%
  summarise(average.weight = mean(weight, na.rm = TRUE))

# Smallest of the species means, then the first column (species) of the
# summary row(s) that match it.
minWeight <- min(mean.wt$average.weight)
minspecies <- subset(mean.wt, average.weight == minWeight)[, 1]




# Do not modify the following code:
knitr::kable(mean.wt, format = "markdown")

species	average.weight
Common Bream	626.00000
Whitefish	531.00000
Roach	152.05000
Silver Bream	154.81818
Smelt	11.17857
Pike	718.70588
Perch	382.23929

The species with the smallest mean weight is Smelt, with a mean weight of 11.1785714 g.

2.5 Plot the mean weights for each species

# One bar per species; bar height is that species' mean weight.
ggplot(data = mean.wt, mapping = aes(x = species, y = average.weight)) +
  geom_bar(stat = "identity", colour = "black", fill = "gray") +
  labs(
    title = "Mean Weight per Species",
    x = "Species",
    y = "Weight"
  )

# Print the per-species summary so the exact means behind the bars can be
# read off alongside the chart.
mean.wt

## # A tibble: 7 × 2
##        species average.weight
##         <fctr>          <dbl>
## 1 Common Bream      626.00000
## 2    Whitefish      531.00000
## 3        Roach      152.05000
## 4 Silver Bream      154.81818
## 5        Smelt       11.17857
## 6         Pike      718.70588
## 7        Perch      382.23929

Part 3: Analysis of Forbes Global 2000 Data (50 total points)

3.1 Import the dataset

library(readr)
library(dplyr)

# Read the Forbes Global 2000 list from a path relative to the working
# directory; readr guesses each column's type and reports the specification.
Forbes <- read_csv(file = "2014 Forbes Global 2000.csv")

## Parsed with column specification:
## cols(
##   Rank = col_integer(),
##   Company = col_character(),
##   Sector = col_character(),
##   Industry = col_character(),
##   Continent = col_character(),
##   Country = col_character(),
##   Sales = col_double(),
##   Profits = col_double(),
##   Assets = col_double(),
##   Market_Value = col_double()
## )

3.2 Exclude specified records

# Keep only companies that have a Sector recorded and non-zero Sales.
# Columns are referenced by bare name (dplyr data masking) rather than through
# Forbes$, and the NA test is negated directly instead of compared to FALSE.
# filter() keeps a row only when every condition is TRUE, so a row whose Sales
# is NA is dropped as well.
Forbes <- filter(Forbes, !is.na(Sector), Sales != 0)

3.3 Convert four variables to factors

# Convert the four categorical text columns to factors in a single step.
Forbes <- mutate(
  Forbes,
  Sector = factor(Sector),
  Industry = factor(Industry),
  Continent = factor(Continent),
  Country = factor(Country)
)

3.4 Create mosaic plot

# ggplot(data = Forbes) +
#   geom_mosaic(aes(x=Sector,y=Continent))

# Cross-tabulate company counts with Sector as rows and Continent as columns.
SectorContinent <- table(Forbes[["Sector"]], Forbes[["Continent"]])

# Mosaic plot of that table. Six shading colours are supplied - presumably one
# per continent; confirm against the number of Continent levels in the data.
mosaicplot(
  SectorContinent,
  main = "Sector by Continent",
  color = c("red", "green", "blue", "yellow", "brown", "purple")
)

# Number of companies in each sector, across all continents.
Forbes.CC <- Forbes %>%
  group_by(Sector) %>%
  summarise(CompanyCount = n())

# First column (Sector) of the row(s) whose count equals the maximum.
answer1 <- subset(Forbes.CC, CompanyCount == max(CompanyCount))[, 1]

# The same tally, restricted to companies whose Continent is North America.
Forbes.NACC <- Forbes %>%
  filter(Continent == "North America") %>%
  group_by(Sector) %>%
  summarise(CompanyCount = n())

# First column (Sector) of the largest North American sector row(s).
answer2 <- subset(Forbes.NACC, CompanyCount == max(CompanyCount))[, 1]

The Financials sector contains the largest number of companies.
North America’s largest sector (in terms of the number of companies) is Financials.

3.5 Create Profit Margin variable

# Profit margin: profits as a fraction of sales, added as a new column.
Forbes <- Forbes %>%
  mutate(ProfMgn = Profits / Sales)


# Do not modify the following code:
knitr::kable(head(Forbes), format = "markdown")

Rank	Company	Sector	Industry	Continent	Country	Sales	Profits	Assets	Market_Value	ProfMgn
1	ICBC	Financials	Major Banks	Asia	China	148.7	42.7	3124.9	215.6	0.2871553
2	China Construction Bank	Financials	Regional Banks	Asia	China	121.3	34.2	2449.5	174.4	0.2819456
3	Agricultural Bank of China	Financials	Regional Banks	Asia	China	136.4	27.0	2405.4	141.1	0.1979472
4	JPMorgan Chase	Financials	Major Banks	North America	United States	105.7	17.3	2435.3	229.7	0.1636708
5	Berkshire Hathaway	Financials	Investment Services	North America	United States	178.8	19.5	493.4	309.1	0.1090604
6	Exxon Mobil	Energy	Oil & Gas Operations	North America	United States	394.0	32.6	346.8	422.3	0.0827411

3.6 Create boxplot of Profit Margin by Sector

# Boxplot of profit margin for each sector. The coordinates are flipped so the
# sector names run down the vertical axis.
p <- ggplot(data = Forbes, mapping = aes(x = Sector, y = ProfMgn))
p + geom_boxplot() + coord_flip()

The sector that appears to have the greatest standard deviation is Consumer Discretionary.

3.7 Calculate the SD for each sector

# Standard deviation of profit margin within each sector.
Forbes.SD <- Forbes %>%
  group_by(Sector) %>%
  summarise(StDev = sd(ProfMgn))

# First column (Sector) of the row(s) with the largest standard deviation.
SDanswer <- subset(Forbes.SD, StDev == max(StDev))[, 1]

# Do not modify the following code:
knitr::kable(Forbes.SD, format = "markdown")

Sector	StDev
Consumer Discretionary	0.6289455
Consumer Staples	0.1000578
Energy	0.1058560
Financials	0.4052307
Health Care	0.1074421
Industrials	0.0993903
Information Technology	0.2345233
Materials	0.2154817
Telecommunication Services	0.1022383
Utilities	0.1623265

The sector that has the greatest standard deviation is Consumer Discretionary. Consumer Discretionary appears to have one strong outlier that pulls its standard deviation above that of the Financials sector.