library(tidyverse)
library(knitr)
library(readxl)
#loc <- readxl::read_xls('http://statweb.lsu.edu/faculty/li/data/Forbes2000.xls')
loc <- readr::read_csv('http://statweb.lsu.edu/faculty/li/data/Forbes2000.csv')

1. Find the median profit for the companies in the US, UK, France and Germany

loc %>% filter(country %in% c('United States', 'United Kingdom', 'France', 'Germany')) %>% 
                 group_by(country) %>% 
                 summarise(median = median(profits, na.rm = T)) %>% 
  kable(caption = 'Median profit')
Median profit
country median
France 0.190
Germany 0.230
United Kingdom 0.205
United States 0.240

2. Find all German companies with negative profit

loc %>% filter(country == 'Germany' & profits < 0) %>% distinct(name) %>% 
  kable(caption = 'German companies with negative profit')
name
Allianz Worldwide
Deutsche Telekom
E.ON
HVB-HypoVereinsbank
Commerzbank
Infineon Technologies
BHW Holding
Bankgesellschaft Berlin
W&W-Wustenrot
mg technologies
Nurnberger Beteiligungs
SPAR Handels
Mobilcom

3. Find the business category to which most of the Bermuda island companies belong.

loc %>% filter(country == 'Bermuda') %>% count(category) %>% top_n(1) %>% kable()
category n
Insurance 10

4. Find the 50 companies in the Forbes dataset with the highest profit

loc %>% arrange(desc(profits)) %>% top_n(50) %>% select(name, profits) %>% kable()
name profits
ExxonMobil 20.96
Citigroup 17.85
General Electric 15.59
Bank of America 10.81
BP 10.27
Altria Group 9.20
Wal-Mart Stores 9.05
Microsoft 8.88
Total 8.84
Royal Dutch/Shell Group 8.40
Toyota Motor 7.99
IBM 7.58
ChevronTexaco 7.43
Merck & Co 7.33
Berkshire Hathaway 6.95
Johnson & Johnson 6.74
HSBC Group 6.66
Fannie Mae 6.48
American Intl Group 6.46
GlaxoSmithKline 6.34
Pfizer 6.20
Wells Fargo 6.20
SBC Communications 5.97
Procter & Gamble 5.81
PetroChina 5.67
Intel 5.64
Nestle 5.48
Novartis Group 5.40
UBS 5.15
Royal Bank of Scotland 4.95
ENI 4.82
Nokia 4.52
JP Morgan Chase 4.47
Cisco Systems 4.35
Coca-Cola 4.35
Home Depot 4.04
United Parcel Service 3.54
PepsiCo 3.49
AstraZeneca 3.29
Siemens Group 2.81
Time Warner 2.65
Dell 2.65
Verizon Commun 2.57
Eli Lilly and Co 2.56
Roche Group 2.48
Amgen 2.26
Nippon Tel & Tel 2.17
Telefsnica -5.86
Vodafone -15.51
Deutsche Telekom -25.83

Plot sales against assets, labelling each point with approriate country name which may need to be abbreviated (using abbreviate) to avoid makeing the plot look too messy

I calculate the mean for each country and plot a barchart because otherwise the plot is unreadable.

library(tidyr)
loc %>% group_by(country) %>% summarise(sales = mean(sales, na.rm = T), assets = mean(assets, na.rm = T)) %>% 
  gather(item, value, assets, sales) %>% 
ggplot(data = ., aes(x = country, y = value, fill=item)) +
  geom_bar(stat = 'identity') +
  theme( axis.text.x = element_text(angle = 45, hjust=1)) +
  ggtitle('Plot of sales against assets', subtitle = 'Bars represent the average for all companies in each country')

5. Find the average value of sales for the companies in each country

loc %>% group_by(country) %>% summarise(avgSales = mean(sales, na.rm = T)) %>% kable(caption = 'Average values of sales by country')
Average values of sales by country
country avgSales
Africa 6.820000
Australia 5.244595
Australia/ United Kingdom 11.595000
Austria 4.142500
Bahamas 1.350000
Belgium 10.114444
Bermuda 6.840500
Brazil 6.338667
Canada 6.429643
Cayman Islands 1.660000
Chile 1.602500
China 5.099600
Czech Republic 1.805000
Denmark 6.349000
Finland 10.291818
France 20.102063
France/ United Kingdom 1.010000
Germany 20.781385
Greece 2.528333
Hong Kong/China 2.044000
Hungary 3.370000
India 3.868148
Indonesia 2.450000
Ireland 4.765000
Islands 6.670000
Israel 2.060000
Italy 10.213902
Japan 10.190633
Jordan 1.330000
Kong/China 5.717500
Korea 15.005000
Liberia 3.780000
Luxembourg 14.185000
Malaysia 1.716250
Mexico 3.937647
Netherlands 17.020714
Netherlands/ United Kingdom 92.100000
New Zealand 2.640000
Norway 10.780000
Pakistan 1.230000
Panama/ United Kingdom 5.930000
Peru 0.170000
Philippines 1.565000
Poland 4.410000
Portugal 3.884286
Russia 7.672500
Singapore 3.685000
South Africa 4.124000
South Korea 7.969333
Spain 7.843448
Sweden 7.665769
Switzerland 12.456765
Taiwan 2.751429
Thailand 2.513333
Turkey 4.713333
United Kingdom 10.445109
United Kingdom/ Australia 10.010000
United Kingdom/ Netherlands 7.540000
United Kingdom/ South Africa 2.060000
United States 10.058256
Venezuela 0.980000

6. Find the number of companies in each country with profits above 5 billion US dollars

loc %>% filter(profits > 5) %>% 
  count(country) %>% arrange(country) %>% kable(caption = 'Number of companies with profits >5 bill.')
Number of companies with profits >5 bill.
country n
China 1
France 1
Germany 1
Japan 1
Netherlands/ United Kingdom 1
South Korea 1
Switzerland 3
United Kingdom 3
United States 20

7. Fit a logistic regression model on the South African Heart Disease Dataset.

The dataset is available from the Elements of Statistical Learning (EOSL) webpage: https://web.stanford.edu/~hastie/ElemStatLearn/ You can also see the data description on EOSL webpage. You can get the data into R by the following code in R.

eosl <- read.table("http://statweb.lsu.edu/faculty/li/data/SAheart.txt",
               sep=",", header=T, row.names=1) 

Set the ‘Present’ as 1 and ‘Absent’ as 0 for variable ‘famhist’.

levels(eosl$famhist) <- c(0, 1)
eosl$famhist <- as.numeric(as.character(eosl$famhist))

7.b There are 462 observations in the dataset. Randomly split the dataset into 400 observations as the training set. The rest 62 observations as the test set.

set.seed(4)
train <- sample(1:nrow(eosl), 400, replace = F)
trainDt <- eosl[train,]
testDt <- eosl[-train,]

7.c Then fit a logistic regression using ‘famhist’ (now become 0 and 1 binary variable) as the response and all the other variables as the explanatory variables.

fit <- glm(famhist ~ . , family = binomial(link='logit'), data = trainDt)
summary(fit)
## 
## Call:
## glm(formula = famhist ~ ., family = binomial(link = "logit"), 
##     data = trainDt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6942  -0.9711  -0.6613   1.1294   1.9676  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -3.249495   1.167428  -2.783  0.00538 **
## sbp         -0.003567   0.005800  -0.615  0.53861   
## tobacco     -0.032501   0.025340  -1.283  0.19964   
## ldl          0.061218   0.059692   1.026  0.30510   
## adiposity   -0.002335   0.026535  -0.088  0.92988   
## typea        0.010253   0.011543   0.888  0.37443   
## obesity      0.040684   0.038184   1.065  0.28667   
## alcohol      0.006697   0.004611   1.452  0.14638   
## age          0.028780   0.011028   2.610  0.00906 **
## chd          0.763461   0.249899   3.055  0.00225 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 539.99  on 399  degrees of freedom
## Residual deviance: 496.52  on 390  degrees of freedom
## AIC: 516.52
## 
## Number of Fisher Scoring iterations: 4

7.d) Make the prediction on the training and test sets. Using the 0.5 as the cutoff point to get the misclassification rate on the training and test sets, respectively.

predictTest <- as.numeric(predict(fit, testDt[, -which(names(testDt) == 'famhist')], type = 'response') > .5)
predictTraining <- as.numeric(predict(fit, trainDt[, -which(names(testDt) == 'famhist')], type = 'response') > .5)
tibble(value = list(test = testDt$famhist, predictTest = predictTest, training = trainDt$famhist, predictTraining = predictTraining)) %>% 
  mutate(set = names(value)) %>% 
  unnest() %>% 
  group_by(set) %>% summarise(positive = sum(value), negative = sum(value == 0)) -> predictionTable
  #gather(outcome, number, positive, negative) -> 
kable(predictionTable, caption = 'Prediction table with the fitted model')
Prediction table with the fitted model
set positive negative
predictTest 26 36
predictTraining 117 283
test 30 32
training 162 238
predictionTable %>% filter(set %in% c('predictTest', 'test')) %>% 
  gather(outcome, number, positive, negative) %>% 
  group_by(outcome) %>% 
  summarise(ratio = number[set == 'predictTest']/number[set =='test']) %>% 
  mutate(ratio = scales::percent(ratio)) %>% 
  kable(caption = 'Percentage of true predictions for the test dataset') 
Percentage of true predictions for the test dataset
outcome ratio
negative 112.5%
positive 86.7%
predictionTable %>% filter(set %in% c('predictTraining', 'training')) %>% 
  gather(outcome, number, positive, negative) %>% 
  group_by(outcome) %>% 
  summarise(ratio = number[set == 'predictTraining']/number[set =='training']) %>% 
  mutate(ratio = scales::percent(ratio)) %>% 
  kable(caption = 'Percentage of true predictions for the training dataset') 
Percentage of true predictions for the training dataset
outcome ratio
negative 118.9%
positive 72.2%

7.e) Find the AUC score and plot the ROC curve based on the test set performance.

library(AUC)
# sensibility <- function(x) {
#  tibble(outcome = testDt$famhist,
#        prob = predict(fit, select(trainDt, -famhist), type='response')) %>% 
#   mutate(predicted = ifelse(prob >= x, 1, 0)) %>%
#   count(outcome, predicted) %>% 
#   complete(outcome = c(0,1), predicted = c(0,1), fill = list(n = 0)) %>% 
#   group_by(outcome) %>% 
#   summarise(ratio = n[predicted == outcome]/ sum(n))
#   }
# seq(0, 1, .01) %>% map(sensibility) -> tt
# tibble(truePositive = unlist(tt %>% map('ratio') %>% map(`[`,1)),
#        falsePositive = 1 - unlist(tt %>% map('ratio') %>% map(`[`,2))) -> dd

dtROC <- roc(predict(fit, select(testDt, -famhist), type='response'), factor(testDt$famhist))
ggplot() +
  geom_line(aes(x = dtROC$fpr, y = dtROC$tpr)) +
  geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1, colour = "segment")) +
  ggtitle('ROC curve') +
  xlab('False positive - 1-sensitivity') +
  ylab('True Positive - specificity') +
  theme(legend.position = 'none')

#AUC score
auc(dtROC)
## [1] 0.703125