#Jeff Nieman R Homework 4
#Create a new data frame with the Forbes data found in Vincent Arel-Bundock's Rdatasets list. The Forbes2000 list looks at 2000 companies by rank, country, category, sales, profits, assets and market value.
# Location of the Forbes2000 CSV in the Rdatasets repository.
# raw.githubusercontent.com is the host that the legacy raw.github.com
# address redirects to, so requesting it directly avoids relying on the redirect.
forbesfile <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/HSAUR/Forbes2000.csv"
# read.csv() is the dedicated comma-separated reader: header = TRUE and
# sep = "," are its defaults, and unlike read.table() it does not treat "'" as
# a quote character or "#" as a comment marker, either of which could corrupt
# free-text fields such as company names.
forbes <- read.csv(file = forbesfile)
# Test data frame: show the first 40 rows
head(forbes, 40)
## X rank name country
## 1 1 1 Citigroup United States
## 2 2 2 General Electric United States
## 3 3 3 American Intl Group United States
## 4 4 4 ExxonMobil United States
## 5 5 5 BP United Kingdom
## 6 6 6 Bank of America United States
## 7 7 7 HSBC Group United Kingdom
## 8 8 8 Toyota Motor Japan
## 9 9 9 Fannie Mae United States
## 10 10 10 Wal-Mart Stores United States
## 11 11 11 UBS Switzerland
## 12 12 12 ING Group Netherlands
## 13 13 13 Royal Dutch/Shell Group Netherlands/ United Kingdom
## 14 14 14 Berkshire Hathaway United States
## 15 15 15 JP Morgan Chase United States
## 16 16 16 IBM United States
## 17 17 17 Total France
## 18 18 18 BNP Paribas France
## 19 19 19 Royal Bank of Scotland United Kingdom
## 20 20 20 Freddie Mac United States
## 21 21 21 DaimlerChrysler Germany
## 22 22 22 Altria Group United States
## 23 23 23 ChevronTexaco United States
## 24 24 24 Pfizer United States
## 25 25 25 Wells Fargo United States
## 26 26 26 Verizon Commun United States
## 27 27 27 Barclays United Kingdom
## 28 28 28 Morgan Stanley United States
## 29 29 29 General Motors United States
## 30 30 30 Nippon Tel & Tel Japan
## 31 31 31 Microsoft United States
## 32 32 32 Nestle Switzerland
## 33 33 33 SBC Communications United States
## 34 34 34 Deutsche Bank Group Germany
## 35 35 35 Siemens Group Germany
## 36 36 36 HBOS United Kingdom
## 37 37 37 ENI Italy
## 38 38 38 ConocoPhillips United States
## 39 39 39 Banco Santander Central Spain
## 40 40 40 Merrill Lynch United States
## category sales profits assets marketvalue
## 1 Banking 94.71 17.85 1264.03 255.30
## 2 Conglomerates 134.19 15.59 626.93 328.54
## 3 Insurance 76.66 6.46 647.66 194.87
## 4 Oil & gas operations 222.88 20.96 166.99 277.02
## 5 Oil & gas operations 232.57 10.27 177.57 173.54
## 6 Banking 49.01 10.81 736.45 117.55
## 7 Banking 44.33 6.66 757.60 177.96
## 8 Consumer durables 135.82 7.99 171.71 115.40
## 9 Diversified financials 53.13 6.48 1019.17 76.84
## 10 Retailing 256.33 9.05 104.91 243.74
## 11 Diversified financials 48.95 5.15 853.23 85.07
## 12 Diversified financials 94.72 4.73 752.49 54.59
## 13 Oil & gas operations 133.50 8.40 100.72 163.45
## 14 Insurance 56.22 6.95 172.24 141.14
## 15 Banking 44.39 4.47 792.70 81.94
## 16 Technology hardware & equipment 89.13 7.58 104.46 171.54
## 17 Oil & gas operations 131.64 8.84 87.84 116.64
## 18 Banking 47.74 4.73 745.09 59.29
## 19 Banking 35.65 4.95 663.45 90.21
## 20 Diversified financials 46.26 10.09 752.25 44.25
## 21 Consumer durables 157.13 5.12 195.58 47.43
## 22 Food drink & tobacco 60.70 9.20 96.18 111.02
## 23 Oil & gas operations 112.94 7.43 82.36 92.49
## 24 Drugs & biotechnology 40.36 6.20 120.06 285.27
## 25 Banking 31.80 6.20 387.80 97.53
## 26 Telecommunications services 67.75 2.57 165.97 103.97
## 27 Banking 33.69 4.90 791.54 61.33
## 28 Diversified financials 33.00 3.64 580.63 64.81
## 29 Consumer durables 185.52 3.82 450.00 27.47
## 30 Telecommunications services 92.41 2.17 150.87 73.00
## 31 Software & services 34.27 8.88 85.94 287.02
## 32 Food drink & tobacco 64.56 5.48 62.15 106.55
## 33 Telecommunications services 39.16 5.97 100.17 82.93
## 34 Diversified financials 58.85 1.53 792.49 50.23
## 35 Conglomerates 86.62 2.81 85.47 75.77
## 36 Banking 32.68 3.09 571.76 52.87
## 37 Oil & gas operations 53.29 4.82 67.91 76.13
## 38 Oil & gas operations 90.49 4.83 81.95 46.72
## 39 Banking 28.70 3.28 442.24 56.78
## 40 Diversified financials 26.64 3.47 485.77 57.52
# Add new field "profitability", defined as profits / sales.
# The new column is appended after the existing ones.
forbes$profitability <- forbes$profits / forbes$sales
head(forbes, n = 40)
## X rank name country
## 1 1 1 Citigroup United States
## 2 2 2 General Electric United States
## 3 3 3 American Intl Group United States
## 4 4 4 ExxonMobil United States
## 5 5 5 BP United Kingdom
## 6 6 6 Bank of America United States
## 7 7 7 HSBC Group United Kingdom
## 8 8 8 Toyota Motor Japan
## 9 9 9 Fannie Mae United States
## 10 10 10 Wal-Mart Stores United States
## 11 11 11 UBS Switzerland
## 12 12 12 ING Group Netherlands
## 13 13 13 Royal Dutch/Shell Group Netherlands/ United Kingdom
## 14 14 14 Berkshire Hathaway United States
## 15 15 15 JP Morgan Chase United States
## 16 16 16 IBM United States
## 17 17 17 Total France
## 18 18 18 BNP Paribas France
## 19 19 19 Royal Bank of Scotland United Kingdom
## 20 20 20 Freddie Mac United States
## 21 21 21 DaimlerChrysler Germany
## 22 22 22 Altria Group United States
## 23 23 23 ChevronTexaco United States
## 24 24 24 Pfizer United States
## 25 25 25 Wells Fargo United States
## 26 26 26 Verizon Commun United States
## 27 27 27 Barclays United Kingdom
## 28 28 28 Morgan Stanley United States
## 29 29 29 General Motors United States
## 30 30 30 Nippon Tel & Tel Japan
## 31 31 31 Microsoft United States
## 32 32 32 Nestle Switzerland
## 33 33 33 SBC Communications United States
## 34 34 34 Deutsche Bank Group Germany
## 35 35 35 Siemens Group Germany
## 36 36 36 HBOS United Kingdom
## 37 37 37 ENI Italy
## 38 38 38 ConocoPhillips United States
## 39 39 39 Banco Santander Central Spain
## 40 40 40 Merrill Lynch United States
## category sales profits assets marketvalue
## 1 Banking 94.71 17.85 1264.03 255.30
## 2 Conglomerates 134.19 15.59 626.93 328.54
## 3 Insurance 76.66 6.46 647.66 194.87
## 4 Oil & gas operations 222.88 20.96 166.99 277.02
## 5 Oil & gas operations 232.57 10.27 177.57 173.54
## 6 Banking 49.01 10.81 736.45 117.55
## 7 Banking 44.33 6.66 757.60 177.96
## 8 Consumer durables 135.82 7.99 171.71 115.40
## 9 Diversified financials 53.13 6.48 1019.17 76.84
## 10 Retailing 256.33 9.05 104.91 243.74
## 11 Diversified financials 48.95 5.15 853.23 85.07
## 12 Diversified financials 94.72 4.73 752.49 54.59
## 13 Oil & gas operations 133.50 8.40 100.72 163.45
## 14 Insurance 56.22 6.95 172.24 141.14
## 15 Banking 44.39 4.47 792.70 81.94
## 16 Technology hardware & equipment 89.13 7.58 104.46 171.54
## 17 Oil & gas operations 131.64 8.84 87.84 116.64
## 18 Banking 47.74 4.73 745.09 59.29
## 19 Banking 35.65 4.95 663.45 90.21
## 20 Diversified financials 46.26 10.09 752.25 44.25
## 21 Consumer durables 157.13 5.12 195.58 47.43
## 22 Food drink & tobacco 60.70 9.20 96.18 111.02
## 23 Oil & gas operations 112.94 7.43 82.36 92.49
## 24 Drugs & biotechnology 40.36 6.20 120.06 285.27
## 25 Banking 31.80 6.20 387.80 97.53
## 26 Telecommunications services 67.75 2.57 165.97 103.97
## 27 Banking 33.69 4.90 791.54 61.33
## 28 Diversified financials 33.00 3.64 580.63 64.81
## 29 Consumer durables 185.52 3.82 450.00 27.47
## 30 Telecommunications services 92.41 2.17 150.87 73.00
## 31 Software & services 34.27 8.88 85.94 287.02
## 32 Food drink & tobacco 64.56 5.48 62.15 106.55
## 33 Telecommunications services 39.16 5.97 100.17 82.93
## 34 Diversified financials 58.85 1.53 792.49 50.23
## 35 Conglomerates 86.62 2.81 85.47 75.77
## 36 Banking 32.68 3.09 571.76 52.87
## 37 Oil & gas operations 53.29 4.82 67.91 76.13
## 38 Oil & gas operations 90.49 4.83 81.95 46.72
## 39 Banking 28.70 3.28 442.24 56.78
## 40 Diversified financials 26.64 3.47 485.77 57.52
## profitability
## 1 0.18847007
## 2 0.11617855
## 3 0.08426820
## 4 0.09404164
## 5 0.04415875
## 6 0.22056723
## 7 0.15023686
## 8 0.05882786
## 9 0.12196499
## 10 0.03530605
## 11 0.10520940
## 12 0.04993666
## 13 0.06292135
## 14 0.12362149
## 15 0.10069836
## 16 0.08504432
## 17 0.06715284
## 18 0.09907834
## 19 0.13884993
## 20 0.21811500
## 21 0.03258448
## 22 0.15156507
## 23 0.06578714
## 24 0.15361744
## 25 0.19496855
## 26 0.03793358
## 27 0.14544375
## 28 0.11030303
## 29 0.02059077
## 30 0.02348231
## 31 0.25911876
## 32 0.08488228
## 33 0.15245148
## 34 0.02599830
## 35 0.03244054
## 36 0.09455324
## 37 0.09044849
## 38 0.05337606
## 39 0.11428571
## 40 0.13025526
# Set up ggplot2 for the layered plots later in the analysis.
# library() stops with an error if ggplot2 is not installed, whereas require()
# only warns and returns FALSE, which would defer the failure to the first
# ggplot() call further down the script.
library(ggplot2)
# Create histogram for profits in the Forbes 2000 list.
# hist() draws the chart and also returns the binning it used as a
# "histogram" object; that object is kept in `h` so it can be inspected.
# NOTE: the argument is deliberately left as `forbes$profits` -- hist() records
# the literal expression as the object's $xname.
h <- hist(forbes$profits)

# Show the breaks, per-bin counts, densities and midpoints behind the chart.
print(h)
## $breaks
## [1] -30 -25 -20 -15 -10 -5 0 5 10 15 20 25
##
## $counts
## [1] 1 2 2 2 4 279 1673 26 3 2 1
##
## $density
## [1] 0.0001002506 0.0002005013 0.0002005013 0.0002005013 0.0004010025
## [6] 0.0279699248 0.1677192982 0.0026065163 0.0003007519 0.0002005013
## [11] 0.0001002506
##
## $mids
## [1] -27.5 -22.5 -17.5 -12.5 -7.5 -2.5 2.5 7.5 12.5 17.5 22.5
##
## $xname
## [1] "forbes$profits"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
#Conclusion #1: - most companies are between $0 and $5B in profits
# Look at profitability (profits / sales) with the same approach:
# draw the histogram and keep the returned "histogram" object in `h1`.
# NOTE: the argument is deliberately left as `forbes$profitability` -- hist()
# records the literal expression as the object's $xname.
h1 <- hist(forbes$profitability)

# Show the binning; a single extreme value stretches the breaks far to the
# right, leaving most bins empty.
print(h1)
## $breaks
## [1] -4 -2 0 2 4 6 8 10 12 14 16 18 20 22 24 26
##
## $counts
## [1] 2 288 1702 2 0 0 0 0 0 0 0 0 0 0
## [15] 1
##
## $density
## [1] 0.0005012531 0.0721804511 0.4265664160 0.0005012531 0.0000000000
## [6] 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## [11] 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0002506266
##
## $mids
## [1] -3 -1 1 3 5 7 9 11 13 15 17 19 21 23 25
##
## $xname
## [1] "forbes$profitability"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# Create scatter plot for sales vs. profits (sales on the y-axis, profits on
# the x-axis).
# Base-graphics plot() is called only for its side effect of drawing: it
# returns NULL, so the result is neither stored nor printed.
plot(sales ~ profits, data = forbes)
#Conclusion #2: - generally profits go up as sales do - but there is no perfect trend. Need to look at profitability vs. sales
# Scatter plot of sales (y-axis) against profitability (x-axis).
# plot() returns NULL, so the call is made purely for the drawing and its
# result is neither stored nor printed.
plot(sales ~ profitability, data = forbes)
#Conclusion #3: - the companies with the highest sales seem to be in the middle of the profitability distribution
# Box plot of profits. boxplot() draws the chart and hands back the statistics
# behind it (box/whisker values, n, notch limits, outliers), kept in `b`.
b <- boxplot(x = forbes[["profits"]])

# Display those statistics.
print(x = b)
## $stats
## [,1]
## [1,] -0.46
## [2,] 0.08
## [3,] 0.20
## [4,] 0.44
## [5,] 0.98
##
## $n
## [1] 1995
##
## $conf
## [,1]
## [1,] 0.1872653
## [2,] 0.2127347
##
## $out
## [1] 17.85 15.59 6.46 20.96 10.27 10.81 6.66 7.99 6.48 9.05
## [11] 5.15 4.73 8.40 6.95 4.47 7.58 8.84 4.73 4.95 10.09
## [21] 5.12 9.20 7.43 6.20 6.20 2.57 4.90 3.64 3.82 2.17
## [31] 8.88 5.48 5.97 1.53 2.81 3.09 4.82 4.83 3.28 3.47
## [41] 4.25 2.65 2.54 5.81 5.95 6.74 2.87 3.98 3.61 3.00
## [51] 4.19 3.40 1.00 1.61 5.67 2.54 2.81 5.64 2.24 4.04
## [61] 2.47 2.73 7.33 5.40 2.11 2.24 3.88 1.12 1.64 3.24
## [71] 2.71 1.40 1.14 2.12 1.19 3.73 2.48 2.28 6.34 1.94
## [81] 4.52 3.59 1.92 2.69 3.81 1.45 4.35 2.13 1.10 1.89
## [91] 4.24 3.54 2.36 1.74 1.63 1.36 2.05 1.70 4.35 1.47
## [101] 3.49 1.88 2.29 2.05 1.22 1.58 3.96 2.65 1.89 1.65
## [111] 1.85 2.36 2.90 3.29 1.49 2.44 2.13 1.11 2.17 1.36
## [121] 2.29 3.29 1.56 1.57 1.10 1.34 1.67 1.61 1.70 1.42
## [131] 1.01 1.09 1.83 1.45 1.38 0.99 2.12 1.49 1.79 2.57
## [141] 1.53 1.07 1.86 1.60 2.37 1.05 2.40 2.31 2.56 1.83
## [151] 1.67 1.27 1.19 3.40 1.04 1.40 1.33 1.53 1.77 1.21
## [161] 2.26 1.04 1.83 1.33 1.89 1.64 1.33 1.15 1.31 2.37
## [171] 1.39 1.16 1.20 1.10 2.08 2.04 1.60 1.62 1.14 1.06
## [181] 3.04 1.32 1.06 1.36 4.45 1.12 1.10 2.30 1.06 1.20
## [191] 1.64 1.11 2.49 0.99 1.48 1.39 1.88 1.00 1.05 1.18
## [201] 1.54 2.20 2.62 1.02 1.40 1.73 1.32 1.02 1.56 1.39
## [211] 1.11 1.20 1.42 1.25 1.29 -1.23 -15.51 1.25 -25.83 1.36
## [221] -2.40 -21.78 -0.79 -3.94 -0.73 -1.37 -0.86 -20.11 -3.96 -5.86
## [231] -0.81 1.05 -0.91 -0.87 -5.10 1.10 -5.15 -3.37 1.03 1.14
## [241] -1.16 -1.93 1.02 -1.03 1.14 -1.27 -0.72 -2.19 -4.15 -4.98
## [251] -7.09 1.15 1.12 -10.02 -0.62 -1.10 -0.81 -1.80 -1.51 -0.93
## [261] -0.66 -2.02 -4.09 -16.03 -1.45 -1.23 -0.50 -0.62 -3.22 -0.98
## [271] -0.51 -0.73 -0.57 -1.50 -10.32 -1.79 -0.50 -0.74 -1.65 -1.00
## [281] -0.77 -0.82 -3.51 -0.74 -0.65 3.27 -0.85 -0.81 -3.78 -2.00
## [291] -3.57 1.11 -0.54 -0.96 -1.44 -1.21 -1.99 1.24 -0.71 -0.57
## [301] -0.76 -1.01 -0.77 -1.99 -0.96 -1.22 -2.28 -4.45 -0.47 -1.65
## [311] -0.48 -0.48 -0.72 -0.47 -0.71 -0.87 -0.91 -0.77 -0.62 -0.87
## [321] -1.00 -2.83 -0.86 -0.53 -0.47 -0.96 -0.95 -2.50 -1.76 -0.56
## [331] -0.62 -1.48 -1.52 -1.72 -3.62
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [246] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [281] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [316] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## $names
## [1] "1"
summary(object = forbes[["profits"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -25.8300 0.0800 0.2000 0.3811 0.4400 20.9600 5
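#Conclusion #4: - the first quartile tops out at only $0.08B in profits (per the histogram, roughly 290 of the 1,995 companies with data have profits of $0 or less) and all of the bottom 3 quartiles are under $0.44B in profits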
# Box plot of sales; the statistics boxplot() returns are kept in `b1`.
b1 <- boxplot(x = forbes[["sales"]])

# Display those statistics.
print(x = b1)
## $stats
## [,1]
## [1,] 0.010
## [2,] 2.015
## [3,] 4.365
## [4,] 9.555
## [5,] 20.650
##
## $n
## [1] 2000
##
## $conf
## [,1]
## [1,] 4.098613
## [2,] 4.631387
##
## $out
## [1] 94.71 134.19 76.66 222.88 232.57 49.01 44.33 135.82 53.13 256.33
## [11] 48.95 94.72 133.50 56.22 44.39 89.13 131.64 47.74 35.65 46.26
## [21] 157.13 60.70 112.94 40.36 31.80 67.75 33.69 33.00 185.52 92.41
## [31] 34.27 64.56 39.16 58.85 86.62 32.68 53.29 90.49 28.70 26.64
## [41] 24.47 38.08 73.06 46.99 50.22 40.01 24.48 23.64 67.44 24.17
## [51] 57.77 21.04 90.10 35.52 29.53 22.84 24.10 30.14 35.79 62.90
## [61] 25.85 32.15 30.78 26.77 38.99 50.70 31.77 41.23 91.33 41.62
## [71] 45.85 52.23 164.20 37.57 25.18 34.16 39.16 63.23 37.05 22.58
## [81] 28.44 96.94 45.68 29.58 32.81 31.03 52.51 32.63 38.17 46.65
## [91] 21.03 26.97 38.22 50.49 21.66 29.14 33.84 41.44 68.23 34.53
## [101] 35.02 41.48 30.42 27.73 22.61 27.06 26.20 22.76 23.10 24.76
## [111] 37.22 21.71 61.30 21.50 30.03 112.76 22.12 24.16 31.82 28.57
## [121] 30.64 23.53 23.94 41.12 21.81 40.57 53.23 24.28 33.74 26.35
## [131] 69.30 42.17 23.56 36.68 54.12 23.37 43.87 111.98 35.97 22.98
## [141] 25.26 23.05 27.54 21.20 28.32 31.73 66.45 26.59 23.85 35.90
## [151] 26.14 27.54 21.94 23.09 78.08 22.77 47.85 34.51 32.34 26.44
## [161] 25.00 96.88 47.99 34.26 56.40 50.58 38.01 57.99 57.90 29.17
## [171] 37.95 52.46 24.40 47.46 29.84 32.99 62.62 88.51 48.41 40.52
## [181] 32.05 27.53 33.42 36.10 25.25 31.41 74.39 21.58 32.87 22.43
## [191] 39.06 70.57 39.72 23.26 58.22 22.17 30.79 25.77 23.66 35.55
## [201] 21.48 36.59 28.10 21.33 21.80 23.26 54.72 29.89 34.77 21.74
## [211] 25.80 22.57
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1
##
## $names
## [1] "1"
summary(object = forbes[["sales"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.010 2.018 4.365 9.697 9.548 256.300
#Conclusion #5: - the spread of sales in the fourth quartile is massive - strong contrast to first 3
# Build layered plot. Goal: look at 3 variables together --
# sales on x, profits on y, and assets mapped to point colour.
g <- ggplot(data = forbes, mapping = aes(x = sales, y = profits))
g + geom_point(mapping = aes(colour = assets))
## Warning: Removed 5 rows containing missing values (geom_point).

#Conclusion #6: - no great surprise but the lower sales lower profit companies tend to have fewer assets
# Colour the points by discrete, non-numeric variables instead: category
# first, then country, each against both profits and profitability.
# `g` is re-pointed at a new base plot whenever the y variable changes.

# Coloured by category, on the base plot `g` already holds at this point.
g + geom_point(mapping = aes(colour = category))
## Warning: Removed 5 rows containing missing values (geom_point).

# sales vs. profitability, coloured by category
g <- ggplot(data = forbes, mapping = aes(x = sales, y = profitability))
g + geom_point(mapping = aes(colour = category))
## Warning: Removed 5 rows containing missing values (geom_point).

# sales vs. profits, coloured by country
g <- ggplot(data = forbes, mapping = aes(x = sales, y = profits))
g + geom_point(mapping = aes(colour = country))
## Warning: Removed 5 rows containing missing values (geom_point).

# sales vs. profitability, coloured by country
g <- ggplot(data = forbes, mapping = aes(x = sales, y = profitability))
g + geom_point(mapping = aes(colour = country))
## Warning: Removed 5 rows containing missing values (geom_point).

# Profitability per country (country on x), with assets as point colour.
g <- ggplot(data = forbes, mapping = aes(x = country, y = profitability))
g + geom_point(mapping = aes(colour = assets))
## Warning: Removed 5 rows containing missing values (geom_point).

#Conclusion #7 - Japan and the UK seem to have a disproportionate number of unprofitable companies on the list