mydata <- read.csv("Fortune 500 Companies.csv",
header = TRUE,
sep = ",",
quote = "\"",
stringsAsFactors = FALSE)
head(mydata)
## name rank year industry
## 1 Walmart 1 2023 General Merchandisers
## 2 Amazon 2 2023 Internet Services and Retailing
## 3 Exxon Mobil 3 2023 Petroleum Refining
## 4 Apple 4 2023 Computers, Office Equipment
## 5 UnitedHealth Group 5 2023 Health Care: Insurance and Managed Care
## 6 CVS Health 6 2023 Health Care: Pharmacy and Other Services
## sector headquarters_state headquarters_city market_value_mil revenue_mil
## 1 Retailing AR Bentonville 397475 611289
## 2 Retailing WA Seattle 1058440 513983
## 3 Energy TX Irving 446424 413680
## 4 Technology CA Cupertino 2609039 394328
## 5 Health Care MN Minnetonka 440854 324162
## 6 Health Care RI Woonsocket 95422 322467
## profit_mil asset_mil employees founder_is_ceo female_ceo
## 1 11680 243197 2100000 no no
## 2 2722 462675 1541000 no no
## 3 55740 369067 62000 no no
## 4 99803 352755 164000 no no
## 5 20120 245705 400000 no no
## 6 4149 228275 259500 no yes
## newcomer_to_fortune_500 global_500
## 1 no yes
## 2 no yes
## 3 no yes
## 4 no yes
## 5 no yes
## 6 no yes
#Comment: This data shows Fortune 500 companies in FY2023. The Fortune 500 is an annual list compiled and published by Fortune magazine that ranks the largest 500 U.S. companies by total revenue.
names(mydata)
## [1] "name" "rank"
## [3] "year" "industry"
## [5] "sector" "headquarters_state"
## [7] "headquarters_city" "market_value_mil"
## [9] "revenue_mil" "profit_mil"
## [11] "asset_mil" "employees"
## [13] "founder_is_ceo" "female_ceo"
## [15] "newcomer_to_fortune_500" "global_500"
summary(mydata[ , sapply(mydata, is.numeric)])
## rank year market_value_mil revenue_mil
## Min. : 1.0 Min. :2023 Min. : 54 Min. : 7238
## 1st Qu.:125.8 1st Qu.:2023 1st Qu.: 8754 1st Qu.: 10652
## Median :250.5 Median :2023 Median : 23200 Median : 16524
## Mean :250.5 Mean :2023 Mean : 69702 Mean : 36289
## 3rd Qu.:375.2 3rd Qu.:2023 3rd Qu.: 58412 3rd Qu.: 32043
## Max. :500.0 Max. :2023 Max. :2609039 Max. :611289
## NA's :24
## profit_mil asset_mil employees
## Min. : 5.0 Min. : 1443 Min. : 381
## 1st Qu.: 692.8 1st Qu.: 13312 1st Qu.: 12602
## Median : 1427.5 Median : 28318 Median : 26658
## Mean : 3612.2 Mean : 110941 Mean : 60815
## 3rd Qu.: 3328.0 3rd Qu.: 73079 3rd Qu.: 60050
## Max. :99803.0 Max. :4305288 Max. :2100000
##
#Proportions of companies by industries
prop.table(table(mydata$sector)) * 100
##
## Aerospace & Defense Apparel
## 1.6 0.8
## Business Services Chemicals
## 3.8 3.2
## Energy Engineering & Construction
## 13.2 2.6
## Financials Food & Drug Stores
## 17.2 1.0
## Food, Beverages & Tobacco Health Care
## 4.8 8.0
## Hotels, Restaurants & Leisure Household Products
## 1.8 1.6
## Industrials Materials
## 3.4 4.6
## Media Motor Vehicles & Parts
## 1.8 2.0
## Retailing Technology
## 9.6 9.8
## Telecommunications Transportation
## 1.4 4.0
## Wholesalers
## 3.8
#Proportions of companies by states
prop.table(table(mydata$headquarters_state)) * 100
##
## AL AR AZ CA CO CT DC DE FL GA IA ID IL IN KS KY
## 0.4 0.8 2.0 10.6 2.0 2.8 0.4 0.2 4.6 3.8 0.4 0.6 6.6 1.4 0.2 0.2
## LA MA MD MI MN MO NC NE NJ NV NY OH OK OR PA RI
## 0.4 3.4 0.6 3.6 3.0 1.6 2.6 0.8 2.8 0.4 10.0 4.8 1.2 0.4 4.6 0.8
## SC TN TX VA WA WI
## 0.2 2.0 11.0 4.8 2.4 1.6
#Share of companies with the Founder being the CEO
prop.table(table(mydata$founder_is_ceo)) * 100
##
## no yes
## 96.4 3.6
#Share of companies with female CEOs
prop.table(table(mydata$female_ceo)) * 100
##
## no yes
## 89.6 10.4
#Share of Newcomers to the list
prop.table(table(mydata$newcomer_to_fortune_500)) * 100
##
## no yes
## 94.4 5.6
#Share of companies being in the Global 500 list as well
prop.table(table(mydata$global_500)) * 100
##
## no yes
## 75.6 24.4
head(mydata)
## name rank year industry
## 1 Walmart 1 2023 General Merchandisers
## 2 Amazon 2 2023 Internet Services and Retailing
## 3 Exxon Mobil 3 2023 Petroleum Refining
## 4 Apple 4 2023 Computers, Office Equipment
## 5 UnitedHealth Group 5 2023 Health Care: Insurance and Managed Care
## 6 CVS Health 6 2023 Health Care: Pharmacy and Other Services
## sector headquarters_state headquarters_city market_value_mil revenue_mil
## 1 Retailing AR Bentonville 397475 611289
## 2 Retailing WA Seattle 1058440 513983
## 3 Energy TX Irving 446424 413680
## 4 Technology CA Cupertino 2609039 394328
## 5 Health Care MN Minnetonka 440854 324162
## 6 Health Care RI Woonsocket 95422 322467
## profit_mil asset_mil employees founder_is_ceo female_ceo
## 1 11680 243197 2100000 no no
## 2 2722 462675 1541000 no no
## 3 55740 369067 62000 no no
## 4 99803 352755 164000 no no
## 5 20120 245705 400000 no no
## 6 4149 228275 259500 no yes
## newcomer_to_fortune_500 global_500
## 1 no yes
## 2 no yes
## 3 no yes
## 4 no yes
## 5 no yes
## 6 no yes
#New Variable Profit Margin
mydata$profit_margin_percentage <- round(mydata$profit_mil / mydata$revenue_mil *100, 1)
head(mydata)
## name rank year industry
## 1 Walmart 1 2023 General Merchandisers
## 2 Amazon 2 2023 Internet Services and Retailing
## 3 Exxon Mobil 3 2023 Petroleum Refining
## 4 Apple 4 2023 Computers, Office Equipment
## 5 UnitedHealth Group 5 2023 Health Care: Insurance and Managed Care
## 6 CVS Health 6 2023 Health Care: Pharmacy and Other Services
## sector headquarters_state headquarters_city market_value_mil revenue_mil
## 1 Retailing AR Bentonville 397475 611289
## 2 Retailing WA Seattle 1058440 513983
## 3 Energy TX Irving 446424 413680
## 4 Technology CA Cupertino 2609039 394328
## 5 Health Care MN Minnetonka 440854 324162
## 6 Health Care RI Woonsocket 95422 322467
## profit_mil asset_mil employees founder_is_ceo female_ceo
## 1 11680 243197 2100000 no no
## 2 2722 462675 1541000 no no
## 3 55740 369067 62000 no no
## 4 99803 352755 164000 no no
## 5 20120 245705 400000 no no
## 6 4149 228275 259500 no yes
## newcomer_to_fortune_500 global_500 profit_margin_percentage
## 1 no yes 1.9
## 2 no yes 0.5
## 3 no yes 13.5
## 4 no yes 25.3
## 5 no yes 6.2
## 6 no yes 1.3
#New Variable - approx. EV/Revenue
mydata$EVRevenue <- round(mydata$market_value_mil / mydata$revenue_mil, 1)
head(mydata)
## name rank year industry
## 1 Walmart 1 2023 General Merchandisers
## 2 Amazon 2 2023 Internet Services and Retailing
## 3 Exxon Mobil 3 2023 Petroleum Refining
## 4 Apple 4 2023 Computers, Office Equipment
## 5 UnitedHealth Group 5 2023 Health Care: Insurance and Managed Care
## 6 CVS Health 6 2023 Health Care: Pharmacy and Other Services
## sector headquarters_state headquarters_city market_value_mil revenue_mil
## 1 Retailing AR Bentonville 397475 611289
## 2 Retailing WA Seattle 1058440 513983
## 3 Energy TX Irving 446424 413680
## 4 Technology CA Cupertino 2609039 394328
## 5 Health Care MN Minnetonka 440854 324162
## 6 Health Care RI Woonsocket 95422 322467
## profit_mil asset_mil employees founder_is_ceo female_ceo
## 1 11680 243197 2100000 no no
## 2 2722 462675 1541000 no no
## 3 55740 369067 62000 no no
## 4 99803 352755 164000 no no
## 5 20120 245705 400000 no no
## 6 4149 228275 259500 no yes
## newcomer_to_fortune_500 global_500 profit_margin_percentage EVRevenue
## 1 no yes 1.9 0.7
## 2 no yes 0.5 2.1
## 3 no yes 13.5 1.1
## 4 no yes 25.3 6.6
## 5 no yes 6.2 1.4
## 6 no yes 1.3 0.3
#Profit Margin interpretation:
summary(mydata$profit_margin_percentage)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.100 3.875 8.600 11.313 15.100 115.100
#Identification and removal of 115.1%
mydata[mydata$profit_margin_percentage > 100, c("name", "profit_margin_percentage", "profit_mil", "revenue_mil")]
## name profit_margin_percentage profit_mil
## 283 Fidelity National Information Services 115.1 16720
## revenue_mil
## 283 14528
#After checking - Fidelity did record a loss in FY2023 but not as substantial as shown here, therefore I will remove the unit entirely
mydata2 <- mydata[mydata$name != "Fidelity National Information Services", ]
#Profit Margin interpretation nr. 2: Profit Margin ranged from 0.1% for Kohls, a retailer, and up to 51.0% for VISA Inc., which is a substantial amount, however quite expected based on their historical performance, business model and market structure of bank card providers/payment solutions. Median was 8.6%, which means that 50% of companies (in mydata2) had a profit margin of 8.6% or less and 50% of companies had a higher profit margin. Another important parameter here is the 3rd quartile at 15.05%, which shows that 75% of comapnies had the profit m. of 15.05% or less and 25% had a higher pr. margin. This leads us to believe that we are dealing with a distribution thats skewed to the right.
summary(mydata2$profit_margin_percentage)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.10 3.85 8.60 11.10 15.05 51.00
mydata2[mydata2$profit_margin_percentage > 50, c("name", "profit_margin_percentage", "profit_mil", "revenue_mil")]
## name profit_margin_percentage profit_mil revenue_mil
## 137 Visa 51 14957 29310
#EV/Revenue multiple interpretation: (24 NA's explained below) The minimum of (approx.) EV/Revenue multiple stands at 0.000x (rounded) for some companies in sectors that are known for enormously high revenue (and low perceived market value), such as Financials (esp. Fannie Mae & Freddie Mac), Energy and Retail (see below for these companies). The maximum (highest multiple) is at 25.4x, which means that one company (Nvidia) had its Market Cap 25.4 times larger than its Revenue. Third quartile stands at 2.5x, which means that we are likely looking at a distribution thats skewed to the right once again. 75% of companies had an EV/Revenue multiple lower or the same as 2.5x (see below for the companies with higher multiple than 2.5x - highest 25%).
summary(mydata2$EVRevenue)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.550 1.300 2.074 2.500 25.400 24
#Show rows with NAs
#Comment: The Fortune 500 is an annual list compiled and published by Fortune magazine that ranks the largest U.S. companies by total revenue. Therefore, companies in this list are not necessarily publicly traded, i.e. there is not necessarily transparent data which could clearly show market_value_mil (Market capitalization), although there are some exceptions to this rule - private companies for which the analysts also calculate market value - EV and Equity Value (Equity Value = approx. Mkt Cap).
mydata2[!complete.cases(mydata2), ]
## name rank year
## 44 State Farm Insurance 44 2023
## 71 New York Life Insurance 71 2023
## 78 Publix Super Markets 78 2023
## 83 Nationwide 83 2023
## 86 Liberty Mutual Insurance Group 86 2023
## 90 CHS 90 2023
## 103 TIAA 103 2023
## 104 Massachusetts Mutual Life Insurance 104 2023
## 111 Northwestern Mutual 111 2023
## 114 USAA 114 2023
## 213 Land O'Lakes 213 2023
## 281 Guardian Life Ins. Co. of America 281 2023
## 295 Farmers Insurance Exchange 295 2023
## 301 American Family Insurance Group 301 2023
## 307 Peter Kiewit Sons' 307 2023
## 312 Pacific Life 312 2023
## 314 Western & Southern Financial Group 314 2023
## 328 Mutual of Omaha 328 2023
## 333 Jones Financial (Edward Jones) 333 2023
## 362 Auto-Owners Insurance 362 2023
## 378 Graybar Electric 378 2023
## 407 Continental Resources 407 2023
## 412 Thrivent Financial for Lutherans 412 2023
## 414 Erie Insurance Group 414 2023
## industry sector
## 44 Insurance: Property and Casualty (Mutual) Financials
## 71 Insurance: Life, Health (Mutual) Financials
## 78 Food & Drug Stores Food & Drug Stores
## 83 Insurance: Property and Casualty (Mutual) Financials
## 86 Insurance: Property and Casualty (Stock) Financials
## 90 Food Production Food, Beverages & Tobacco
## 103 Insurance: Life, Health (Mutual) Financials
## 104 Insurance: Life, Health (Mutual) Financials
## 111 Insurance: Life, Health (Mutual) Financials
## 114 Insurance: Property and Casualty (Stock) Financials
## 213 Food Consumer Products Food, Beverages & Tobacco
## 281 Insurance: Life, Health (Mutual) Financials
## 295 Insurance: Property and Casualty (Mutual) Financials
## 301 Insurance: Property and Casualty (Stock) Financials
## 307 Engineering & Construction Engineering & Construction
## 312 Insurance: Life, Health (Stock) Financials
## 314 Insurance: Life, Health (Mutual) Financials
## 328 Insurance: Life, Health (Mutual) Financials
## 333 Securities Financials
## 362 Insurance: Property and Casualty (Mutual) Financials
## 378 Wholesalers: Diversified Wholesalers
## 407 Mining, Crude-Oil Production Energy
## 412 Insurance: Life, Health (Mutual) Financials
## 414 Insurance: Property and Casualty (Mutual) Financials
## headquarters_state headquarters_city market_value_mil revenue_mil
## 44 IL Bloomington NA 89328
## 71 NY New York NA 58445
## 78 FL Lakeland NA 54942
## 83 OH Columbus NA 51450
## 86 MA Boston NA 49956
## 90 MN Inver Grove Heights NA 47792
## 103 NY New York NA 40911
## 104 MA Springfield NA 40281
## 111 WI Milwaukee NA 36921
## 114 TX San Antonio NA 36297
## 213 MN Arden Hills NA 19226
## 281 NY New York NA 14653
## 295 CA Woodland Hills NA 14092
## 301 WI Madison NA 13814
## 307 NE Omaha NA 13663
## 312 CA Newport Beach NA 13360
## 314 OH Cincinnati NA 13156
## 328 NE Omaha NA 12531
## 333 MO Des Peres NA 12411
## 362 MI Lansing NA 11407
## 378 MO St. Louis NA 10534
## 407 OK Oklahoma City NA 9474
## 412 MN Minneapolis NA 9347
## 414 PA Erie NA 9296
## profit_mil asset_mil employees founder_is_ceo female_ceo
## 44 6654 318243 60519 no no
## 71 1127 392126 15050 no no
## 78 2918 31047 242000 no no
## 83 988 264511 24791 no no
## 86 414 160316 50000 no no
## 90 1679 18825 10014 no no
## 103 494 634457 16070 no yes
## 104 1485 381336 10662 no no
## 111 912 340390 8323 no no
## 114 1296 204005 36820 no no
## 213 241 9770 9000 no yes
## 281 71 91620 8025 no no
## 295 314 22979 11344 no no
## 301 1904 36088 12990 no no
## 307 710 7220 25700 no no
## 312 763 199324 3950 no no
## 314 309 69147 2594 no no
## 328 177 45140 6350 no no
## 333 1404 29892 52000 no yes
## 362 55 33219 6733 no no
## 378 453 3749 9400 no yes
## 407 4025 20878 1404 no no
## 412 1085 110100 3692 no yes
## 414 577 26598 5997 no no
## newcomer_to_fortune_500 global_500 profit_margin_percentage EVRevenue
## 44 no yes 7.4 NA
## 71 no yes 1.9 NA
## 78 no yes 5.3 NA
## 83 no yes 1.9 NA
## 86 no yes 0.8 NA
## 90 no yes 3.5 NA
## 103 no yes 1.2 NA
## 104 no yes 3.7 NA
## 111 no yes 2.5 NA
## 114 no yes 3.6 NA
## 213 no no 1.3 NA
## 281 no no 0.5 NA
## 295 no no 2.2 NA
## 301 no no 13.8 NA
## 307 no no 5.2 NA
## 312 no no 5.7 NA
## 314 no no 2.3 NA
## 328 no no 1.4 NA
## 333 no no 11.3 NA
## 362 no no 0.5 NA
## 378 no no 4.3 NA
## 407 yes no 42.5 NA
## 412 no no 11.6 NA
## 414 no no 6.2 NA
#Show low EVRevenue multiples
mydata2[mydata2$EVRevenue < 0.1, c("name","sector", "EVRevenue", "market_value_mil", "revenue_mil")]
## name sector EVRevenue market_value_mil
## 28 Fannie Mae Financials 0 475
## NA <NA> <NA> NA NA
## 45 Freddie Mac Financials 0 266
## 59 StoneX Group Financials 0 2138
## 70 World Fuel Services Energy 0 1585
## NA.1 <NA> <NA> NA NA
## 74 Plains GP Holdings Energy 0 2551
## NA.2 <NA> <NA> NA NA
## NA.3 <NA> <NA> NA NA
## NA.4 <NA> <NA> NA NA
## NA.5 <NA> <NA> NA NA
## NA.6 <NA> <NA> NA NA
## NA.7 <NA> <NA> NA NA
## NA.8 <NA> <NA> NA NA
## NA.9 <NA> <NA> NA NA
## 161 Rite Aid Food & Drug Stores 0 127
## NA.10 <NA> <NA> NA NA
## NA.11 <NA> <NA> NA NA
## NA.12 <NA> <NA> NA NA
## NA.13 <NA> <NA> NA NA
## NA.14 <NA> <NA> NA NA
## NA.15 <NA> <NA> NA NA
## NA.16 <NA> <NA> NA NA
## NA.17 <NA> <NA> NA NA
## NA.18 <NA> <NA> NA NA
## 342 Qurate Retail Retailing 0 406
## NA.19 <NA> <NA> NA NA
## NA.20 <NA> <NA> NA NA
## NA.21 <NA> <NA> NA NA
## NA.22 <NA> <NA> NA NA
## NA.23 <NA> <NA> NA NA
## 467 NGL Energy Partners Energy 0 381
## 472 Bed Bath & Beyond Retailing 0 183
## 491 SVB Financial Group Financials 0 54
## revenue_mil
## 28 121596
## NA NA
## 45 86717
## 59 66036
## 70 59043
## NA.1 NA
## 74 57342
## NA.2 NA
## NA.3 NA
## NA.4 NA
## NA.5 NA
## NA.6 NA
## NA.7 NA
## NA.8 NA
## NA.9 NA
## 161 24568
## NA.10 NA
## NA.11 NA
## NA.12 NA
## NA.13 NA
## NA.14 NA
## NA.15 NA
## NA.16 NA
## NA.17 NA
## NA.18 NA
## 342 12106
## NA.19 NA
## NA.20 NA
## NA.21 NA
## NA.22 NA
## NA.23 NA
## 467 7948
## 472 7868
## 491 7401
#Show top 25%: Usually technology, health care, etc. (industries with high anticipation of value produced in the future)
mydata2[mydata2$EVRevenue > 2.5, c("name","sector", "EVRevenue", "market_value_mil", "revenue_mil")]
## name sector EVRevenue
## 4 Apple Technology 6.6
## 8 Alphabet Technology 4.7
## 13 Microsoft Technology 10.8
## 31 Meta Platforms Technology 4.7
## 40 Johnson & Johnson Health Care 5.1
## NA <NA> <NA> NA
## 46 PepsiCo Food, Beverages & Tobacco 2.9
## 50 Tesla Motor Vehicles & Parts 8.1
## 51 Procter & Gamble Household Products 4.4
## 69 Merck Health Care 4.6
## NA.1 <NA> <NA> NA
## 73 AbbVie Health Care 4.8
## NA.2 <NA> <NA> NA
## 82 Cisco Systems Technology 4.2
## NA.3 <NA> <NA> NA
## NA.4 <NA> <NA> NA
## NA.5 <NA> <NA> NA
## 93 Nike Apparel 4.1
## 95 Bristol-Myers Squibb Health Care 3.2
## 97 Thermo Fisher Scientific Technology 4.9
## 98 Qualcomm Technology 3.2
## 99 Abbott Laboratories Health Care 4.0
## 100 Coca-Cola Food, Beverages & Tobacco 6.2
## 101 Oracle Technology 5.9
## NA.6 <NA> <NA> NA
## NA.7 <NA> <NA> NA
## NA.8 <NA> <NA> NA
## NA.9 <NA> <NA> NA
## 115 Honeywell International Industrials 3.6
## 123 Broadcom Technology 8.1
## 125 Starbucks Hotels, Restaurants & Leisure 3.7
## 128 Philip Morris International Food, Beverages & Tobacco 4.8
## 129 Netflix Media 4.9
## 131 Mondelez International Food, Beverages & Tobacco 3.0
## 132 Danaher Health Care 5.8
## 133 Salesforce Technology 6.4
## 137 Visa Business Services 16.2
## 138 Southern Energy 2.6
## 141 Duke Energy Energy 2.6
## 142 Eli Lilly Health Care 11.4
## 148 PayPal Holdings Business Services 3.1
## 150 Gilead Sciences Health Care 3.8
## 152 Nvidia Technology 25.4
## 154 Amgen Health Care 4.9
## 155 Applied Materials Technology 4.0
## 156 EOG Resources Energy 2.6
## 159 Union Pacific Transportation 5.0
## 167 Advanced Micro Devices Technology 6.7
## 169 McDonald's Hotels, Restaurants & Leisure 8.8
## 171 Freeport-McMoRan Energy 2.6
## 175 Charles Schwab Financials 4.3
## 177 Mastercard Business Services 15.6
## 178 Sherwin-Williams Chemicals 2.6
## 187 NextEra Energy Energy 7.3
## 193 Marsh & McLennan Financials 4.0
## 194 Altria Group Food, Beverages & Tobacco 3.9
## 200 Texas Instruments Technology 8.4
## 204 Waste Management Business Services 3.4
## 209 Becton Dickinson Health Care 3.6
## 211 Moderna Health Care 3.1
## NA.10 <NA> <NA> NA
## 219 General Mills Food, Beverages & Tobacco 2.6
## 224 Stryker Health Care 5.9
## 228 Colgate-Palmolive Household Products 3.5
## 229 BlackRock Financials 5.6
## 230 Fiserv Business Services 4.0
## 231 Estee Lauder Household Products 5.0
## 233 Adobe Technology 10.0
## 240 Lam Research Technology 4.2
## 242 Dominion Energy Energy 2.7
## 243 Booking Holdings Technology 5.8
## 251 Automatic Data Processing Business Services 5.6
## 258 AutoZone Retailing 2.8
## 260 Illinois Tool Works Industrials 4.7
## 261 Parker-Hannifin Industrials 2.7
## 279 CSX Transportation 4.1
## NA.11 <NA> <NA> NA
## 285 Sempra Energy 3.3
## 286 IQVIA Holdings Health Care 2.6
## 287 O'Reilly Automotive Retailing 3.6
## 293 Ecolab Chemicals 3.3
## NA.12 <NA> <NA> NA
## 296 Keurig Dr Pepper Food, Beverages & Tobacco 3.5
## NA.13 <NA> <NA> NA
## 306 Otis Worldwide Industrials 2.6
## NA.14 <NA> <NA> NA
## 309 Republic Services Business Services 3.2
## NA.15 <NA> <NA> NA
## 313 VMware Technology 4.0
## NA.16 <NA> <NA> NA
## 320 Norfolk Southern Transportation 3.8
## 321 Intuit Technology 9.8
## 322 Air Products & Chemicals Chemicals 5.0
## 323 Boston Scientific Health Care 5.7
## 326 Amphenol Technology 3.8
## NA.17 <NA> <NA> NA
## NA.18 <NA> <NA> NA
## 339 Regeneron Pharmaceuticals Health Care 7.4
## 344 Analog Devices Technology 8.3
## 348 Newmont Energy 3.3
## 356 Apollo Global Management Financials 3.1
## 358 Hess Energy 3.5
## NA.19 <NA> <NA> NA
## 366 S&P Global Business Services 10.1
## 370 Williams Energy 3.3
## 373 American Tower Financials 8.9
## NA.20 <NA> <NA> NA
## 380 Hershey Food, Beverages & Tobacco 5.0
## 383 Ulta Beauty Retailing 2.7
## 386 Biogen Health Care 3.9
## 392 Public Service Enterprise Group Energy 3.2
## 400 Diamondback Energy Energy 2.6
## 401 Intercontinental Exchange Financials 6.1
## 404 WEC Energy Group Energy 3.1
## 405 Yum China Holdings Hotels, Restaurants & Leisure 2.8
## NA.21 <NA> <NA> NA
## NA.22 <NA> <NA> NA
## NA.23 <NA> <NA> NA
## 416 KLA Technology 6.0
## 418 Motorola Solutions Technology 5.3
## 425 Global Payments Business Services 3.1
## 427 Vertex Pharmaceuticals Health Care 9.1
## 430 Constellation Brands Food, Beverages & Tobacco 4.7
## 431 Hilton Worldwide Holdings Hotels, Restaurants & Leisure 4.3
## 438 Chipotle Mexican Grill Hotels, Restaurants & Leisure 5.5
## 443 Arthur J. Gallagher Financials 4.8
## 444 Blackstone Financials 7.3
## 450 Airbnb Technology 9.5
## 454 ON Semiconductor Technology 4.3
## 461 Lululemon athletica Retailing 5.7
## 463 Zoetis Health Care 9.5
## 470 PPL Energy 2.6
## 473 Cintas Business Services 6.0
## 476 Rockwell Automation Industrials 4.3
## 480 Ameren Energy 3.0
## 484 Activision Blizzard Media 8.9
## 493 Albemarle Chemicals 3.5
## 494 Vulcan Materials Materials 3.1
## 496 KKR Financials 6.2
## 497 Equinix Financials 9.2
## 499 ServiceNow Technology 13.0
## market_value_mil revenue_mil
## 4 2609039 394328
## 8 1330201 282836
## 13 2146049 198270
## 31 549484 116609
## 40 483576 94943
## NA NA NA
## 46 251085 86392
## 50 656425 81462
## 51 350781 80187
## 69 270081 59283
## NA.1 NA NA
## 73 281151 58054
## NA.2 NA NA
## 82 214109 51557
## NA.3 NA NA
## NA.4 NA NA
## NA.5 NA NA
## 93 190161 46710
## 95 145780 46159
## 97 222150 44915
## 98 142252 44200
## 99 175984 43653
## 100 268361 43004
## 101 250866 42440
## NA.6 NA NA
## NA.7 NA NA
## NA.8 NA NA
## NA.9 NA NA
## 115 127695 35466
## 123 267473 33203
## 125 119677 32250
## 128 150946 31762
## 129 153858 31616
## 131 95050 31496
## 132 183764 31471
## 133 199780 31352
## 137 475307 29310
## 138 75948 29279
## 141 74344 28784
## 142 326351 28541
## 148 85917 27518
## 150 103614 27281
## 152 686092 26974
## 154 129089 26323
## 155 103806 25785
## 156 67371 25702
## 159 123146 24875
## 167 157738 23601
## 169 204534 23183
## 171 58530 22780
## 175 95264 22307
## 177 346418 22237
## 178 58090 22149
## 187 153196 20956
## 193 82371 20720
## 194 79672 20688
## 200 168775 20028
## 204 66372 19698
## 209 70277 19408
## 211 59232 19263
## NA.10 NA NA
## 219 50195 18993
## 224 108228 18449
## 228 62535 17967
## 229 100526 17873
## 230 70997 17737
## 231 88041 17737
## 233 176769 17606
## 240 71532 17227
## 242 46699 17174
## 243 99859 17090
## 251 92247 16498
## 258 45224 16252
## 260 74209 15932
## 261 43112 15862
## 279 61330 14853
## NA.11 NA NA
## 285 47563 14439
## 286 37022 14410
## 287 52269 14410
## 293 47121 14188
## NA.12 NA NA
## 296 49620 14057
## NA.13 NA NA
## 306 35015 13685
## NA.14 NA NA
## 309 42763 13511
## NA.15 NA NA
## 313 53496 13350
## NA.16 NA NA
## 320 48257 12745
## 321 125076 12726
## 322 63784 12699
## 323 71910 12682
## 326 48591 12623
## NA.17 NA NA
## NA.18 NA NA
## 339 89830 12173
## 344 99764 12014
## 348 38947 11915
## 356 36042 11627
## 358 40520 11570
## NA.19 NA NA
## 366 113066 11181
## 370 36394 10965
## 373 95150 10711
## NA.20 NA NA
## 380 51909 10419
## 383 27390 10209
## 386 40171 10173
## 392 31148 9800
## 400 24816 9643
## 401 58372 9636
## 404 29900 9597
## 405 26517 9569
## NA.21 NA NA
## NA.22 NA NA
## NA.23 NA NA
## 416 55277 9212
## 418 47917 9112
## 425 27761 8976
## 427 81002 8931
## 430 41688 8821
## 431 37535 8773
## 438 47186 8635
## 443 40955 8551
## 444 62048 8518
## 450 79682 8399
## 454 35560 8326
## 461 46312 8111
## 463 77053 8080
## 470 20472 7902
## 473 47042 7855
## 476 33683 7760
## 480 22675 7662
## 484 67126 7528
## 493 25928 7320
## 494 22827 7315
## 496 45225 7273
## 497 66873 7263
## 499 94338 7245
# Order mydata2 by EVRevenue descending and show top 10
top10 <- mydata2[order(-mydata2$EVRevenue), ]
head(top10, 10)
## name rank year industry
## 152 Nvidia 152 2023 Semiconductors and Other Electronic Components
## 137 Visa 137 2023 Financial Data Services
## 177 Mastercard 177 2023 Financial Data Services
## 499 ServiceNow 499 2023 Computer Software
## 142 Eli Lilly 142 2023 Pharmaceuticals
## 13 Microsoft 13 2023 Computer Software
## 366 S&P Global 366 2023 Financial Data Services
## 233 Adobe 233 2023 Computer Software
## 321 Intuit 321 2023 Computer Software
## 450 Airbnb 450 2023 Internet Services and Retailing
## sector headquarters_state headquarters_city market_value_mil
## 152 Technology CA Santa Clara 686092
## 137 Business Services CA San Francisco 475307
## 177 Business Services NY Purchase 346418
## 499 Technology CA Santa Clara 94338
## 142 Health Care IN Indianapolis 326351
## 13 Technology WA Redmond 2146049
## 366 Business Services NY New York 113066
## 233 Technology CA San Jose 176769
## 321 Technology CA Mountain View 125076
## 450 Technology CA San Francisco 79682
## revenue_mil profit_mil asset_mil employees founder_is_ceo female_ceo
## 152 26974 4368 41182 26196 yes no
## 137 29310 14957 85501 26500 no no
## 177 22237 9930 38724 29900 no no
## 499 7245 325 13299 20433 no no
## 142 28541 6245 49490 39000 no no
## 13 198270 72738 364840 221000 no no
## 366 11181 3248 61784 39950 no no
## 233 17606 4756 27165 29239 no no
## 321 12726 2066 27734 21850 no no
## 450 8399 1893 16038 6811 yes no
## newcomer_to_fortune_500 global_500 profit_margin_percentage EVRevenue
## 152 no no 16.2 25.4
## 137 no no 51.0 16.2
## 177 no no 44.7 15.6
## 499 yes no 4.5 13.0
## 142 no no 21.9 11.4
## 13 no yes 36.7 10.8
## 366 no no 29.0 10.1
## 233 no no 27.0 10.0
## 321 no no 16.2 9.8
## 450 yes no 22.5 9.5
#Scatterplot and histograms for EV/Revenue and Profit Margin at the same time. As foreseen, both distributions are skewed to the right. A weak positive correlation is seen between these 2 variables with many outliers, which also points to the fact that financial markets operate based on many factors, with some of them being expectations for the future, emotions, etc. Nvidia can be seen as the highest unit in the scatter plot (25.4x multiple), whereas VISA can be seen on the far right with 51.0% Profit Margin.
library(ggplot2)
library(ggExtra)
# Swap axes: x = profit_margin_percentage, y = EVRevenue
p <- ggplot(mydata2, aes(x = profit_margin_percentage, y = EVRevenue)) +
geom_point(alpha = 0.6, color = "blue") +
theme_minimal() +
labs(
title = "EV/Revenue vs Profit Margin",
x = "Profit Margin (%)",
y = "EV / Revenue"
)
# Add marginal histograms
ggMarginal(p, type = "histogram", fill = "lightblue", color = "darkblue")
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
#Histogram for EV/Revenue - removed 24 NAs automatically.
library(ggplot2)
ggplot(mydata2, aes(x = EVRevenue)) +
geom_histogram(binwidth = 0.1, fill = "lightblue", color = "darkblue") +
theme_minimal() +
labs(
title = "Distribution of EV/Revenue",
x = "EV / Revenue",
y = "Frequency"
)
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_bin()`).
library(ggplot2)
library(patchwork)
# Boxplot for EVRevenue
p1 <- ggplot(mydata2, aes(y = EVRevenue)) +
geom_boxplot(fill = "lightblue", color = "darkblue", alpha = 0.7) +
theme_minimal() +
labs(title = "EV/Revenue", y = "EV / Revenue")
# Boxplot for Profit Margin
p2 <- ggplot(mydata2, aes(y = profit_margin_percentage)) +
geom_boxplot(fill = "lightgreen", color = "darkgreen", alpha = 0.7) +
theme_minimal() +
labs(title = "Profit Margin (%)", y = "Profit Margin (%)")
p_combined <- p1 | p2 # use | for side-by-side, not +
p_combined
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
#Task 2
library(readxl)
mydata3 <- read_excel("~/Documents/R Take Home Exam 2025/Task 2/Business School.xlsx")
head(mydata3)
## # A tibble: 6 × 9
## `Student ID` `Undergrad Degree` `Undergrad Grade` `MBA Grade`
## <dbl> <chr> <dbl> <dbl>
## 1 1 Business 68.4 90.2
## 2 2 Computer Science 70.2 68.7
## 3 3 Finance 76.4 83.3
## 4 4 Business 82.6 88.7
## 5 5 Finance 76.9 75.4
## 6 6 Computer Science 83.3 82.1
## # ℹ 5 more variables: `Work Experience` <chr>, `Employability (Before)` <dbl>,
## # `Employability (After)` <dbl>, Status <chr>, `Annual Salary` <dbl>
#Answer: Mode or the most common Undergrad degree is Business, followed by Computer Science and Finance, both with the same number of graduates.
library(ggplot2)
colnames(mydata3)
## [1] "Student ID" "Undergrad Degree" "Undergrad Grade"
## [4] "MBA Grade" "Work Experience" "Employability (Before)"
## [7] "Employability (After)" "Status" "Annual Salary"
if("Undergrad Degree" %in% colnames(mydata3)) {
names(mydata3)[names(mydata3) == "Undergrad Degree"] <- "Undergrad_Degree"
}
ggplot(mydata3, aes(x = Undergrad_Degree)) +
geom_bar(fill = "lightblue", color = "darkblue") +
theme_minimal() +
labs(
title = "Distribution of Undergraduate Degrees",
x = "Undergraduate Degree",
y = "Count"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#desc stats
library(ggplot2)
colnames(mydata3)
## [1] "Student ID" "Undergrad_Degree" "Undergrad Grade"
## [4] "MBA Grade" "Work Experience" "Employability (Before)"
## [7] "Employability (After)" "Status" "Annual Salary"
if("Annual Salary" %in% colnames(mydata3)) {
names(mydata3)[names(mydata3) == "Annual Salary"] <- "Annual_Salary"
}
summary(mydata3$Annual_Salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 87125 103500 109058 124000 340000
# Desc stat cont'd
sd(mydata3$Annual_Salary, na.rm = TRUE)
## [1] 41501.49
#Histogram
ggplot(mydata3, aes(x = Annual_Salary)) +
geom_histogram(binwidth = 5000, fill = "lightblue", color = "darkblue") +
theme_minimal() +
labs(
title = "Distribution of Annual Salary",
x = "Annual Salary",
y = "Frequency"
)
#Interpretation: Unimodal, right-skewed, central tendency is around 100k, with Mean being more than Median, we can see that we have more extremes towards the high values (high salaries) - two or three outliers.
#t-test
t.test(mydata3$`MBA Grade`, mu = 74)
##
## One Sample t-test
##
## data: mydata3$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
#Interpretation: We can reject H_0 at p-value = 0.00915 (p < 0.05) and accept H_1 - Mean is significantly (statistically) different than 74 at a 95% confidence interval.
#Cohen's d
library(effectsize)
cohens_d(mydata3$`MBA Grade`, mu = 74)
## Cohen's d | 95% CI
## ------------------------
## 0.27 | [0.07, 0.46]
##
## - Deviation from a difference of 74.
#Interpretation: the effect size is small (d = 0.27), so the difference, though real, is not very large in practical terms.
library(readxl)
mydata4 <- read_excel("~/Documents/R Take Home Exam 2025/Task 3/Apartments.xlsx")
head(mydata4)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
#Comment: They are already factors (0 and 1) in the given data. I will then turn it back into original data (numericals).
mydata4$Parking <- factor(mydata4$Parking,
levels = c(0, 1),
labels = c("No", "Yes"))
head(mydata4)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <fct> <dbl>
## 1 7 28 1640 No 1
## 2 18 1 2800 Yes 0
## 3 7 28 1660 No 0
## 4 28 29 1850 No 1
## 5 18 18 1640 Yes 1
## 6 28 12 1770 No 1
mydata4$Parking <- factor(mydata4$Parking,
levels = c("No", "Yes"),
labels = c(0, 1))
head(mydata4)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <fct> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
t.test(mydata4$Price, mu = 1900)
##
## One Sample t-test
##
## data: mydata4$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
#Interpretation: We can reject H_0 at p-value = 0.004731 (p < 0.05) and accept H_1 - Mean is significantly (statistically) different than 1900 at a 95% confidence interval.
fit1 <- lm(Price ~ Age, data = mydata4)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
#Interpretation: The negative coefficient of Age indicates that Price decreases by about 8.98 units per year, and this effect is statistically significant (p = 0.034). The correlation between Price and Age is approximately -0.23, showing a weak negative relationship. The coefficient of determination, R^2 = 0.053, means only 5.3% of Price variation is explained by Age, so most variation is due to other factors. Age has a significant but limited effect on Price.
pairs(~Price + Age + Distance, data = mydata4,
main = "Scatterplot Matrix of Price, Age, and Distance")
#Correlation between Age and Distance
cor(mydata4[, c("Age", "Distance")])
## Age Distance
## Age 1.00000000 0.04290813
## Distance 0.04290813 1.00000000
#Interpretation: The correlation between Age and Distance is 0.043, which indicates almost no linear relationship between these two explanatory variables.
fit2 <- lm(Price ~ Age + Distance, data = mydata4)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
library(car)
## Loading required package: carData
vif(fit2)
## Age Distance
## 1.001845 1.001845
#VIF is very close to 1, which indicates that there is virtually no multicollinearity between Age and Distance. The regression coefficients should be reliable.
std_res <- rstandard(fit2)
cooks_d <- cooks.distance(fit2)
n <- nrow(mydata4)
outliers <- which(abs(std_res) > 2)
influential <- which(cooks_d > 4/n)
problematic_units <- unique(c(outliers, influential))
problematic_units
## [1] 33 38 53 22 55
mydata4_clean <- mydata4[-problematic_units, ]
fit2_clean <- lm(Price ~ Age + Distance, data = mydata4_clean)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata4_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
std_res_clean <- rstandard(fit2_clean)
std_fitted_clean <- scale(fit2_clean$fitted.values)
plot(std_fitted_clean, std_res_clean,
xlab = "Standardized Fitted Values",
ylab = "Standardized Residuals",
main = "Residuals vs Fitted Values (fit2_clean)")
abline(h = 0, col = "red", lty = 2)
#Interpretation: We can see that we may have Heteroskedasticity (Homosked. - variance of residuals is constant --> this assumption is probably corrupted).
shapiro.test(std_res_clean)
##
## Shapiro-Wilk normality test
##
## data: std_res_clean
## W = 0.94156, p-value = 0.001168
#Interpretation: The Shapiro-Wilk test for the standardized residuals gives W = 0.942 with a p-value of 0.001, indicating that the residuals are not normally distributed as first thought by looking at the graph.
fit2_clean <- lm(Price ~ Age + Distance, data = mydata4_clean)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata4_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
#Interpretation: Intercept (2502.47): This is the predicted Price when both Age and Distance are zero, representing the baseline price. Age (-8.67): Holding Distance constant, Price decreases by approximately 8.67 units for each additional year of Age. This effect is statistically significant (p = 0.0087), indicating older items tend to be cheaper. Distance (-24.06): Holding Age constant, Price decreases by about 24.06 units for each additional unit of Distance. This is highly significant (p < 0.001), showing that greater Distance strongly lowers Price.
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = mydata4_clean)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = mydata4_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## Parking1 128.700 60.801 2.117 0.0376 *
## Balcony 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
anova(fit2_clean, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 77 5077362
## 2 75 4791128 2 286234 2.2403 0.1135
#Interpretation: Since p = 0.1135 > 0.05, we fail to reject H₀. Adding the categorical variables Parking and Balcony does not significantly improve the model fit over the simpler model with just Age and Distance. Therefore, fit2_clean may be sufficient for explaining Price.
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = mydata4_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## Parking1 128.700 60.801 2.117 0.0376 *
## Balcony 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
#Interpretation: Parking --> Given the values of Age, Distance, and the Balcony being in the reference category (No), the price per m² of an apartment with Parking (Parking = Yes) is on average 128.7 EUR higher than an apartment without Parking, holding all other variables constant. Balcony --> given the values of Age, Distance, and Parking being in their reference category (Parking = No), the price per m² of an apartment with a Balcony is on average 6.03 EUR higher than an apartment without a Balcony, holding other variables constant.
#Hypothesis: H_0: beta_Age = beta_Distance = beta_ParkingYes = beta_BalconyYes = 0
mydata4_clean$ID <- paste0("ID", 1:nrow(mydata4_clean))
fitted_values <- fitted(fit3)
observed_price_ID2 <- mydata4_clean$Price[mydata4_clean$ID == "ID2"]
predicted_price_ID2 <- fitted_values[mydata4_clean$ID == "ID2"]
residual_ID2 <- observed_price_ID2 - predicted_price_ID2
residual_ID2
## 2
## 443.4026