This is an exploratory data analysis of stock prices.
You need to obtain the data using pandas_datareader. We will get stock information for the following banks:
https://pandas-datareader.readthedocs.io/en/latest/remote_data.html
datareader to grab info on the stock.> # Bank of America
+ BAC = data.DataReader("BAC", 'yahoo', start, end)
+
+ # CitiGroup
+ C = data.DataReader("C", 'yahoo', start, end)
+
+ # Goldman Sachs
+ GS = data.DataReader("GS", 'yahoo', start, end)
+
+ # JPMorgan Chase
+ JPM = data.DataReader("JPM", 'yahoo', start, end)
+
+ # Morgan Stanley
+ MS = data.DataReader("MS", 'yahoo', start, end)
+
+ # Wells Fargo
+ WFC = data.DataReader("WFC", 'yahoo', start, end)BAC.head()
> # Displayed using R instead of Python
>
> library(tidyverse)
> library(knitr) #load library for table formatting
> library(kableExtra) #load library for table formatting
>
> kable(head(py$BAC)) %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 12,position="left")%>%
+ row_spec(0,background="lightpink")| High | Low | Open | Close | Volume | Adj Close | |
|---|---|---|---|---|---|---|
| 2006-01-03 | 47.18 | 46.15 | 46.92 | 47.08 | 16296700 | 34.81173 |
| 2006-01-04 | 47.24 | 46.45 | 47.00 | 46.58 | 17757900 | 34.44201 |
| 2006-01-05 | 46.83 | 46.32 | 46.58 | 46.64 | 14970700 | 34.48639 |
| 2006-01-06 | 46.91 | 46.35 | 46.80 | 46.57 | 12599800 | 34.43462 |
| 2006-01-09 | 46.97 | 46.36 | 46.72 | 46.60 | 15619400 | 34.45681 |
| 2006-01-10 | 46.51 | 45.88 | 46.40 | 46.21 | 15634600 | 34.16842 |
bank_stocks. Set the keys argument equal to the tickers list.> bank_stocks.columns.names = ['Bank Ticker','Stock Info']
+
+ bank_stocks_d = bank_stocks.iloc[0:5,0:7]
+
+ # Displayed using R instead of Python (Code not shown)Although not displayed well, there is an upper column named Bank Ticker with the ticker names and a second column named Stock Info which holds the data. Each level can be referenced.
| BAC High | Low | Open | Close | Volume | Adj Close | C High | |
|---|---|---|---|---|---|---|---|
| 2006-01-03 | 47.18 | 46.15 | 46.92 | 47.08 | 16296700 | 34.81173 | 493.8 |
| 2006-01-04 | 47.24 | 46.45 | 47.00 | 46.58 | 17757900 | 34.44201 | 491.0 |
| 2006-01-05 | 46.83 | 46.32 | 46.58 | 46.64 | 14970700 | 34.48639 | 487.8 |
| 2006-01-06 | 46.91 | 46.35 | 46.80 | 46.57 | 12599800 | 34.43462 | 489.0 |
| 2006-01-09 | 46.97 | 46.36 | 46.72 | 46.60 | 15619400 | 34.45681 | 487.4 |
Read the documentation on Multi-Level Indexing
http://pandas.pydata.org/pandas-docs/stable/advanced.html
And Using.xs
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.xs.html
> bank_max = bank_stocks.xs(key='Adj Close',axis=1,
+ level='Stock Info').max()
+
+ # Displayed using R instead of Python (Code not shown)| max adj close | |
|---|---|
| BAC | 42.09922 |
| C | 501.54697 |
| GS | 207.49736 |
| JPM | 60.65808 |
| MS | 58.26871 |
| WFC | 49.07849 |
\[r_t = \frac{p_t - p_{t-1}}{p_{t-1}} = \frac{p_t}{p_{t-1}} - 1\]
> for tick in tickers:
+ returns[tick+' Return'] = bank_stocks[tick]['Adj Close'].pct_change()
+
+ # Displayed using R instead of Python (Code not shown)| BAC Return | C Return | GS Return | JPM Return | MS Return | WFC Return | |
|---|---|---|---|---|---|---|
| 2006-01-03 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2006-01-04 | -0.0106205 | -0.0184620 | -0.0138125 | -0.0057718 | 0.0006860 | -0.0115986 |
| 2006-01-05 | 0.0012883 | 0.0049608 | -0.0003934 | 0.0030287 | 0.0027423 | -0.0011102 |
| 2006-01-06 | -0.0015012 | 0.0000000 | 0.0141682 | 0.0070460 | 0.0010253 | 0.0058740 |
| 2006-01-09 | 0.0006444 | -0.0047304 | 0.0120310 | 0.0162417 | 0.0105854 | -0.0001575 |
| 2006-01-10 | -0.0083695 | 0.0030995 | 0.0125773 | 0.0014754 | 0.0005068 | -0.0007898 |
> import seaborn as sns
+ import matplotlib.pyplot as plt
+ sns.set_style('darkgrid')
+ sns.pairplot(returns[1:]);
+ plt.show()> # Worst Drop (4 of them on Inauguration day)
+ bank_worst = returns.idxmin()
+
+ # Displayed using R instead of Python (Code not shown)| worst return | |
|---|---|
| BAC Return | 2009-01-20 |
| C Return | 2009-02-27 |
| GS Return | 2009-01-20 |
| JPM Return | 2009-01-20 |
| MS Return | 2008-10-09 |
| WFC Return | 2009-01-20 |
| best return | |
|---|---|
| BAC Return | 2009-04-09 |
| C Return | 2008-11-24 |
| GS Return | 2008-11-24 |
| JPM Return | 2009-01-21 |
| MS Return | 2008-10-13 |
| WFC Return | 2008-07-16 |
| overall std | |
|---|---|
| BAC Return | 0.0366591 |
| C Return | 0.0386710 |
| GS Return | 0.0253863 |
| JPM Return | 0.0276747 |
| MS Return | 0.0377176 |
| WFC Return | 0.0301966 |
> std_2015_py = returns.loc['2015-01-01':'2015-12-31'].std()
+
+ # Displayed using R instead of Python (Code not shown)| 2015 std | |
|---|---|
| BAC Return | 0.0161740 |
| C Return | 0.0152880 |
| GS Return | 0.0140431 |
| JPM Return | 0.0140058 |
| MS Return | 0.0162868 |
| WFC Return | 0.0125520 |
> sns.set_style('whitegrid')
+ plt.figure(figsize=(10,8))
+ sns.distplot(returns.loc['2015-01-01':'2015-12-31']['MS Return'],
+ color='green',bins=100);
+ plt.show()> plt.figure(figsize=(10,8))
+ sns.distplot(returns.loc['2008-01-01':'2008-12-31']['C Return'],
+ color='red',bins=100);
+ plt.show()> for tick in tickers:
+ bank_stocks[tick]['Close'].plot(figsize=(12,6),label=tick)
+ plt.legend()
+ plt.show()> #plotly
+ df = bank_stocks.xs(key='Close',axis=1,level='Stock Info')
+ fig = px.line(df)
+ fig.write_html("stock.html")> plt.figure(figsize=(12,6))
+ BAC['Close'].loc['2008-01-01':'2009-01-01'].rolling(
+ window=30).mean().plot(label='30 Day Avg');
+ BAC['Close'].loc['2008-01-01':'2009-01-01'].plot(
+ label='BAC CLOSE');
+ plt.legend()
+ plt.show()> sns.heatmap(bank_stocks.xs(key='Close',axis=1,
+ level='Stock Info').corr(),annot=True);
+ plt.show()> sns.clustermap(bank_stocks.xs(key='Close',
+ axis=1,level='Stock Info').corr(),annot=True);
+ plt.show()> #plotly
+ close_corr = bank_stocks.xs(key='Close',
+ axis=1,level='Stock Info').corr()
+ fig = px.density_heatmap(close_corr,
+ color_continuous_scale="rdylbu")
+ fig.write_html("density.html")> cand = BAC[['Open', 'High', 'Low',
+ 'Close']].loc['2015-01-01':'2016-01-01']
+ cand.reset_index(inplace=True)> import plotly.graph_objects as go
+
+ fig = go.Figure(data=[go.Candlestick(x=cand['Date'],
+ open=cand['Open'],
+ high=cand['High'],
+ low=cand['Low'],
+ close=cand['Close'])])
+
+ fig.write_html("candle.html")> returns2 = pd.DataFrame()
+ for tick in tickers:
+ returns2[tick] = bank_stocks[tick
+ ]['Adj Close'].pct_change()+1
+
+ returns3 = returns2.cumprod()-1
+
+ returns3 = returns3.fillna(0)
+ returns3.columns.names = ['company']An analysis of 911 call data from Kaggle. The data contains the following fields:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99492 entries, 0 to 99491
Data columns (total 9 columns):
lat 99492 non-null float64
lng 99492 non-null float64
desc 99492 non-null object
zip 86637 non-null float64
title 99492 non-null object
timeStamp 99492 non-null object
twp 99449 non-null object
addr 98973 non-null object
e 99492 non-null int64
dtypes: float64(3), int64(1), object(5)
memory usage: 6.8+ MB
> # Displayed using R instead of Python
>
> library(tidyverse)
> library(knitr) #load library for table formatting
> library(kableExtra) #load library for table formatting
>
> kable(head(py$df)) %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="lightpink")| lat | lng | desc | zip | title | timeStamp | twp | addr | e |
|---|---|---|---|---|---|---|---|---|
| 40.29788 | -75.58129 | REINDEER CT & DEAD END; NEW HANOVER; Station 332; 2015-12-10 @ 17:10:52; | 19525 | EMS: BACK PAINS/INJURY | 2015-12-10 17:40:00 | NEW HANOVER | REINDEER CT & DEAD END | 1 |
| 40.25806 | -75.26468 | BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP; Station 345; 2015-12-10 @ 17:29:21; | 19446 | EMS: DIABETIC EMERGENCY | 2015-12-10 17:40:00 | HATFIELD TOWNSHIP | BRIAR PATH & WHITEMARSH LN | 1 |
| 40.12118 | -75.35198 | HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-Station:STA27; | 19401 | Fire: GAS-ODOR/LEAK | 2015-12-10 17:40:00 | NORRISTOWN | HAWS AVE | 1 |
| 40.11615 | -75.34351 | AIRY ST & SWEDE ST; NORRISTOWN; Station 308A; 2015-12-10 @ 16:47:36; | 19401 | EMS: CARDIAC EMERGENCY | 2015-12-10 17:40:01 | NORRISTOWN | AIRY ST & SWEDE ST | 1 |
| 40.25149 | -75.60335 | CHERRYWOOD CT & DEAD END; LOWER POTTSGROVE; Station 329; 2015-12-10 @ 16:56:52; | NaN | EMS: DIZZINESS | 2015-12-10 17:40:01 | LOWER POTTSGROVE | CHERRYWOOD CT & DEAD END | 1 |
| 40.25347 | -75.28324 | CANNON AVE & W 9TH ST; LANSDALE; Station 345; 2015-12-10 @ 15:39:04; | 19446 | EMS: HEAD INJURY | 2015-12-10 17:40:01 | LANSDALE | CANNON AVE & W 9TH ST | 1 |
19401.0 6979
19464.0 6643
19403.0 4854
19446.0 4748
19406.0 3174
Name: zip, dtype: int64
LOWER MERION 8443
ABINGTON 5977
NORRISTOWN 5890
UPPER MERION 5227
CHELTENHAM 4575
Name: twp, dtype: int64
110
.apply() with a custom lambda expression to create a new column called “Reason” that contains this string value.'EMS'
EMS 48877
Traffic 35695
Fire 14920
Name: Reason, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99492 entries, 0 to 99491
Data columns (total 10 columns):
lat 99492 non-null float64
lng 99492 non-null float64
desc 99492 non-null object
zip 86637 non-null float64
title 99492 non-null object
timeStamp 99492 non-null object
twp 99449 non-null object
addr 98973 non-null object
e 99492 non-null int64
Reason 99492 non-null object
dtypes: float64(3), int64(1), object(6)
memory usage: 7.6+ MB
<class 'str'>
pd.to_datetime to convert the column from strings to DateTime objects.http://pandas.pydata.org/pandas-docs/stable/generated/pandas.to_datetime.html
.apply() to create 3 new columns called Hour, Month, and Day of Week.> df['Hour'] = df['timeStamp'].apply(
+ lambda time: time.hour)
+ df['Month'] = df['timeStamp'].apply(
+ lambda time: time.month)
+ df['Day of Week'] = df['timeStamp'].apply(
+ lambda time: time.dayofweek).map() with this dictionary to map the actual string names to the day of the week:> dmap = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',
+ 4:'Fri',5:'Sat',6:'Sun'}
+
+ df['Day of Week'] = df['Day of Week'].map(dmap)> plt.figure(figsize=(8,6))
+
+ sns.countplot(x='Day of Week',data=df,
+ hue='Reason',palette='rainbow');
+
+ # To relocate the legend
+ plt.legend(bbox_to_anchor=(1.05, 1),
+ loc=2, borderaxespad=0.)
+
+ plt.tight_layout()
+ plt.show()> plt.figure(figsize=(8,6))
+
+ sns.countplot(x='Month',data=df,hue='Reason',
+ palette='rainbow');
+
+ # To relocate the legend
+ plt.legend(bbox_to_anchor=(1.05, 1),
+ loc=2, borderaxespad=0.)
+
+ plt.tight_layout()
+ plt.show()It is missing months 9,10,11. A line plot will help.
count() method for aggregation. Use the head() method on this returned DataFrame.| lat | lng | desc | zip | title | timeStamp | twp | addr | e | Reason | Hour | Day of Week |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 13205 | 13205 | 13205 | 11527 | 13205 | 13205 | 13203 | 13096 | 13205 | 13205 | 13205 | 13205 |
| 11467 | 11467 | 11467 | 9930 | 11467 | 11467 | 11465 | 11396 | 11467 | 11467 | 11467 | 11467 |
| 11101 | 11101 | 11101 | 9755 | 11101 | 11101 | 11092 | 11059 | 11101 | 11101 | 11101 | 11101 |
| 11326 | 11326 | 11326 | 9895 | 11326 | 11326 | 11323 | 11283 | 11326 | 11326 | 11326 | 11326 |
| 11423 | 11423 | 11423 | 9946 | 11423 | 11423 | 11420 | 11378 | 11423 | 11423 | 11423 | 11423 |
| 11786 | 11786 | 11786 | 10212 | 11786 | 11786 | 11777 | 11732 | 11786 | 11786 | 11786 | 11786 |
lmplot() to create a linear fit on the number of calls per month.> plt.figure(figsize=(10,8))
+
+ sns.lmplot(x='Month',y='twp',
+ data=byMonth.reset_index());
+
+ plt.show().date() method.count() aggregate and create a plot of counts of 911 calls.> plt.figure(figsize=(10,8))
+
+ df.groupby('Date').count()['twp'].plot();
+
+ plt.tight_layout()
+ plt.show()> plt.figure(figsize=(10,8))
+
+ df[df['Reason']=='Traffic'].groupby(
+ 'Date').count()['twp'].plot()
+ plt.title('Traffic');
+
+ plt.tight_layout()
+ plt.show()> plt.figure(figsize=(10,8))
+
+ df[df['Reason']=='Fire'].groupby(
+ 'Date').count()['twp'].plot()
+ plt.title('Fire');
+
+ plt.tight_layout()
+ plt.show()> plt.figure(figsize=(10,8))
+
+ df[df['Reason']=='EMS'].groupby(
+ 'Date').count()['twp'].plot()
+ plt.title('EMS');
+
+ plt.tight_layout()
+ plt.show()| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Fri | 275 | 235 | 191 | 175 | 201 | 194 | 372 | 598 | 742 | 752 | 803 | 859 | 885 | 890 | 932 | 980 | 1039 | 980 | 820 | 696 | 667 | 559 | 514 | 474 |
| Mon | 282 | 221 | 201 | 194 | 204 | 267 | 397 | 653 | 819 | 786 | 793 | 822 | 893 | 842 | 869 | 913 | 989 | 997 | 885 | 746 | 613 | 497 | 472 | 325 |
| Sat | 375 | 301 | 263 | 260 | 224 | 231 | 257 | 391 | 459 | 640 | 697 | 769 | 801 | 831 | 789 | 796 | 848 | 757 | 778 | 696 | 628 | 572 | 506 | 467 |
| Sun | 383 | 306 | 286 | 268 | 242 | 240 | 300 | 402 | 483 | 620 | 643 | 693 | 771 | 679 | 684 | 691 | 663 | 714 | 670 | 655 | 537 | 461 | 415 | 330 |
| Thu | 278 | 202 | 233 | 159 | 182 | 203 | 362 | 570 | 777 | 828 | 837 | 773 | 889 | 936 | 876 | 969 | 935 | 1013 | 810 | 698 | 617 | 553 | 424 | 354 |
| Tue | 269 | 240 | 186 | 170 | 209 | 239 | 415 | 655 | 889 | 880 | 840 | 838 | 887 | 917 | 943 | 938 | 1026 | 1019 | 905 | 731 | 647 | 571 | 462 | 274 |
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 12 | |
|---|---|---|---|---|---|---|---|---|---|
| Fri | 1970 | 1581 | 1525 | 1958 | 1730 | 1649 | 2045 | 1310 | 1065 |
| Mon | 1727 | 1964 | 1535 | 1598 | 1779 | 1617 | 1692 | 1511 | 1257 |
| Sat | 2291 | 1441 | 1266 | 1734 | 1444 | 1388 | 1695 | 1099 | 978 |
| Sun | 1960 | 1229 | 1102 | 1488 | 1424 | 1333 | 1672 | 1021 | 907 |
| Thu | 1584 | 1596 | 1900 | 1601 | 1590 | 2065 | 1646 | 1230 | 1266 |
| Tue | 1973 | 1753 | 1884 | 1430 | 1918 | 1676 | 1670 | 1612 | 1234 |