library(tidyverse)
library(knitr)
library(readxl)
#loc <- readxl::read_xls('http://statweb.lsu.edu/faculty/li/data/Forbes2000.xls')
loc <- readr::read_csv('http://statweb.lsu.edu/faculty/li/data/Forbes2000.csv')

1. Find the median profit for the companies in the US, UK, France and Germany

loc %>% filter(country %in% c('United States', 'United Kingdom', 'France', 'Germany')) %>% 
                 group_by(country) %>% 
                 summarise(median = median(profits, na.rm = T)) %>% 
  kable(caption = 'Median profit')

Median profit
country	median
France	0.190
Germany	0.230
United Kingdom	0.205
United States	0.240

2. Find all German companies with negative profit

loc %>% filter(country == 'Germany' & profits < 0) %>% distinct(name) %>% 
  kable(caption = 'German companies with negative profit')

name
Allianz Worldwide
Deutsche Telekom
E.ON
HVB-HypoVereinsbank
Commerzbank
Infineon Technologies
BHW Holding
Bankgesellschaft Berlin
W&W-Wustenrot
mg technologies
Nurnberger Beteiligungs
SPAR Handels
Mobilcom

3. Find the business category to which most of the Bermuda island companies belong.

loc %>% filter(country == 'Bermuda') %>% count(category) %>% top_n(1) %>% kable()

category	n
Insurance	10

4. Find the 50 companies in the Forbes dataset with the highest profit

loc %>% arrange(desc(profits)) %>% top_n(50) %>% select(name, profits) %>% kable()

name	profits
ExxonMobil	20.96
Citigroup	17.85
General Electric	15.59
Bank of America	10.81
BP	10.27
Altria Group	9.20
Wal-Mart Stores	9.05
Microsoft	8.88
Total	8.84
Royal Dutch/Shell Group	8.40
Toyota Motor	7.99
IBM	7.58
ChevronTexaco	7.43
Merck & Co	7.33
Berkshire Hathaway	6.95
Johnson & Johnson	6.74
HSBC Group	6.66
Fannie Mae	6.48
American Intl Group	6.46
GlaxoSmithKline	6.34
Pfizer	6.20
Wells Fargo	6.20
SBC Communications	5.97
Procter & Gamble	5.81
PetroChina	5.67
Intel	5.64
Nestle	5.48
Novartis Group	5.40
UBS	5.15
Royal Bank of Scotland	4.95
ENI	4.82
Nokia	4.52
JP Morgan Chase	4.47
Cisco Systems	4.35
Coca-Cola	4.35
Home Depot	4.04
United Parcel Service	3.54
PepsiCo	3.49
AstraZeneca	3.29
Siemens Group	2.81
Time Warner	2.65
Dell	2.65
Verizon Commun	2.57
Eli Lilly and Co	2.56
Roche Group	2.48
Amgen	2.26
Nippon Tel & Tel	2.17
Telefsnica	-5.86
Vodafone	-15.51
Deutsche Telekom	-25.83

Plot sales against assets, labelling each point with approriate country name which may need to be abbreviated (using abbreviate) to avoid makeing the plot look too messy

I calculate the mean for each country and plot a barchart because otherwise the plot is unreadable.

library(tidyr)
loc %>% group_by(country) %>% summarise(sales = mean(sales, na.rm = T), assets = mean(assets, na.rm = T)) %>% 
  gather(item, value, assets, sales) %>% 
ggplot(data = ., aes(x = country, y = value, fill=item)) +
  geom_bar(stat = 'identity') +
  theme( axis.text.x = element_text(angle = 45, hjust=1)) +
  ggtitle('Plot of sales against assets', subtitle = 'Bars represent the average for all companies in each country')

5. Find the average value of sales for the companies in each country

loc %>% group_by(country) %>% summarise(avgSales = mean(sales, na.rm = T)) %>% kable(caption = 'Average values of sales by country')

Average values of sales by country
country	avgSales
Africa	6.820000
Australia	5.244595
Australia/ United Kingdom	11.595000
Austria	4.142500
Bahamas	1.350000
Belgium	10.114444
Bermuda	6.840500
Brazil	6.338667
Canada	6.429643
Cayman Islands	1.660000
Chile	1.602500
China	5.099600
Czech Republic	1.805000
Denmark	6.349000
Finland	10.291818
France	20.102063
France/ United Kingdom	1.010000
Germany	20.781385
Greece	2.528333
Hong Kong/China	2.044000
Hungary	3.370000
India	3.868148
Indonesia	2.450000
Ireland	4.765000
Islands	6.670000
Israel	2.060000
Italy	10.213902
Japan	10.190633
Jordan	1.330000
Kong/China	5.717500
Korea	15.005000
Liberia	3.780000
Luxembourg	14.185000
Malaysia	1.716250
Mexico	3.937647
Netherlands	17.020714
Netherlands/ United Kingdom	92.100000
New Zealand	2.640000
Norway	10.780000
Pakistan	1.230000
Panama/ United Kingdom	5.930000
Peru	0.170000
Philippines	1.565000
Poland	4.410000
Portugal	3.884286
Russia	7.672500
Singapore	3.685000
South Africa	4.124000
South Korea	7.969333
Spain	7.843448
Sweden	7.665769
Switzerland	12.456765
Taiwan	2.751429
Thailand	2.513333
Turkey	4.713333
United Kingdom	10.445109
United Kingdom/ Australia	10.010000
United Kingdom/ Netherlands	7.540000
United Kingdom/ South Africa	2.060000
United States	10.058256
Venezuela	0.980000

6. Find the number of companies in each country with profits above 5 billion US dollars

loc %>% filter(profits > 5) %>% 
  count(country) %>% arrange(country) %>% kable(caption = 'Number of companies with profits >5 bill.')

Number of companies with profits >5 bill.
country	n
China	1
France	1
Germany	1
Japan	1
Netherlands/ United Kingdom	1
South Korea	1
Switzerland	3
United Kingdom	3
United States	20

7. Fit a logistic regression model on the South African Heart Disease Dataset.

The dataset is available from the Elements of Statistical Learning (EOSL) webpage: https://web.stanford.edu/~hastie/ElemStatLearn/ You can also see the data description on EOSL webpage. You can get the data into R by the following code in R.

eosl <- read.table("http://statweb.lsu.edu/faculty/li/data/SAheart.txt",
               sep=",", header=T, row.names=1)

Set the ‘Present’ as 1 and ‘Absent’ as 0 for variable ‘famhist’.

levels(eosl$famhist) <- c(0, 1)
eosl$famhist <- as.numeric(as.character(eosl$famhist))

7.b There are 462 observations in the dataset. Randomly split the dataset into 400 observations as the training set. The rest 62 observations as the test set.

set.seed(4)
train <- sample(1:nrow(eosl), 400, replace = F)
trainDt <- eosl[train,]
testDt <- eosl[-train,]

7.c Then fit a logistic regression using ‘famhist’ (now become 0 and 1 binary variable) as the response and all the other variables as the explanatory variables.

fit <- glm(famhist ~ . , family = binomial(link='logit'), data = trainDt)
summary(fit)

## 
## Call:
## glm(formula = famhist ~ ., family = binomial(link = "logit"), 
##     data = trainDt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6942  -0.9711  -0.6613   1.1294   1.9676  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -3.249495   1.167428  -2.783  0.00538 **
## sbp         -0.003567   0.005800  -0.615  0.53861   
## tobacco     -0.032501   0.025340  -1.283  0.19964   
## ldl          0.061218   0.059692   1.026  0.30510   
## adiposity   -0.002335   0.026535  -0.088  0.92988   
## typea        0.010253   0.011543   0.888  0.37443   
## obesity      0.040684   0.038184   1.065  0.28667   
## alcohol      0.006697   0.004611   1.452  0.14638   
## age          0.028780   0.011028   2.610  0.00906 **
## chd          0.763461   0.249899   3.055  0.00225 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 539.99  on 399  degrees of freedom
## Residual deviance: 496.52  on 390  degrees of freedom
## AIC: 516.52
## 
## Number of Fisher Scoring iterations: 4

7.d) Make the prediction on the training and test sets. Using the 0.5 as the cutoff point to get the misclassification rate on the training and test sets, respectively.

predictTest <- as.numeric(predict(fit, testDt[, -which(names(testDt) == 'famhist')], type = 'response') > .5)
predictTraining <- as.numeric(predict(fit, trainDt[, -which(names(testDt) == 'famhist')], type = 'response') > .5)
tibble(value = list(test = testDt$famhist, predictTest = predictTest, training = trainDt$famhist, predictTraining = predictTraining)) %>% 
  mutate(set = names(value)) %>% 
  unnest() %>% 
  group_by(set) %>% summarise(positive = sum(value), negative = sum(value == 0)) -> predictionTable
  #gather(outcome, number, positive, negative) -> 
kable(predictionTable, caption = 'Prediction table with the fitted model')

Prediction table with the fitted model
set	positive	negative
predictTest	26	36
predictTraining	117	283
test	30	32
training	162	238

predictionTable %>% filter(set %in% c('predictTest', 'test')) %>% 
  gather(outcome, number, positive, negative) %>% 
  group_by(outcome) %>% 
  summarise(ratio = number[set == 'predictTest']/number[set =='test']) %>% 
  mutate(ratio = scales::percent(ratio)) %>% 
  kable(caption = 'Percentage of true predictions for the test dataset')

Percentage of true predictions for the test dataset
outcome	ratio
negative	112.5%
positive	86.7%

predictionTable %>% filter(set %in% c('predictTraining', 'training')) %>% 
  gather(outcome, number, positive, negative) %>% 
  group_by(outcome) %>% 
  summarise(ratio = number[set == 'predictTraining']/number[set =='training']) %>% 
  mutate(ratio = scales::percent(ratio)) %>% 
  kable(caption = 'Percentage of true predictions for the training dataset')

Percentage of true predictions for the training dataset
outcome	ratio
negative	118.9%
positive	72.2%

7.e) Find the AUC score and plot the ROC curve based on the test set performance.

library(AUC)
# sensibility <- function(x) {
#  tibble(outcome = testDt$famhist,
#        prob = predict(fit, select(trainDt, -famhist), type='response')) %>% 
#   mutate(predicted = ifelse(prob >= x, 1, 0)) %>%
#   count(outcome, predicted) %>% 
#   complete(outcome = c(0,1), predicted = c(0,1), fill = list(n = 0)) %>% 
#   group_by(outcome) %>% 
#   summarise(ratio = n[predicted == outcome]/ sum(n))
#   }
# seq(0, 1, .01) %>% map(sensibility) -> tt
# tibble(truePositive = unlist(tt %>% map('ratio') %>% map(`[`,1)),
#        falsePositive = 1 - unlist(tt %>% map('ratio') %>% map(`[`,2))) -> dd

dtROC <- roc(predict(fit, select(testDt, -famhist), type='response'), factor(testDt$famhist))
ggplot() +
  geom_line(aes(x = dtROC$fpr, y = dtROC$tpr)) +
  geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1, colour = "segment")) +
  ggtitle('ROC curve') +
  xlab('False positive - 1-sensitivity') +
  ylab('True Positive - specificity') +
  theme(legend.position = 'none')

#AUC score
auc(dtROC)

## [1] 0.703125

Homework2

Dario Bonaretti

8/30/2017