# add these two lines underneath the chunk where you have included the use_python line.
import os
os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = 'C:/ProgramData/Anaconda3/Library/plugins/platforms'
#Import data and create df
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.ticker as ticker
import folium

from urllib.request import urlretrieve

# Local location of the working copy of the landslide CSV.
path = "U:/"
filename = "WorkingLandslide_20022022.csv"
file_path = f"{path}{filename}"

# Remote export (data.nasa.gov) and the destination it is saved to.
# dst and file_path name the same file.
url = 'https://data.nasa.gov/api/views/dd9e-wu2v/rows.csv?accessType=DOWNLOAD'
dst = 'U:/WorkingLandslide_20022022.csv'

# Download the CSV to disk; the (local path, headers) tuple that
# urlretrieve returns is not needed here.
urlretrieve(url, dst)

# Load the downloaded file into the data frame used by every chart below.
df = pd.read_csv(file_path)

Introduction

Here is an intro section.

Dataset

Here is something about my data.

Findings

This is some general text about my findings before I show the individual charts in tabs. If you add .tabset-pills inside the curly braces, it will generate orange tab buttons

An Overview

Global Landslide Frequency by Location and Date

Geographic Distribution of Global Landslides

This map visualizes the geographic distribution of landslides recorded in NASA’s global landslide database (from 1988 - 2017) with each point representing a recorded landslide event. The points are color-coded based on the size category of the landslide, ranging from small to catastrophic.

## Latitude/Longitude
# Keep only the columns the map needs: the size category and the coordinates.
map_df = df[['landslide_size', 'longitude', 'latitude']].copy()
print(map_df)

# Create map using 0,0 as center point
center_of_map = [0, 0]

my_map = folium.Map(location=center_of_map,
                    zoom_start=1,
                    width='90%',
                    height='100%',
                    left='5%',
                    right='5%')

# Add different tile layers
# NOTE(review): the 'stamenterrain' tile name may not resolve in newer folium
# releases - confirm against the installed version.
folium.TileLayer('stamenterrain').add_to(my_map)
folium.TileLayer('cartodbpositron').add_to(my_map)
folium.TileLayer('cartodbdark_matter').add_to(my_map)
# Add layer control to switch between tile layers
folium.LayerControl().add_to(my_map)

# Color code landslide by size: (marker color, tooltip) per size category.
size_styles = {
    'small': ('blue', 'Small Landslide'),
    'medium': ('green', 'Medium Landslide'),
    'large': ('yellow', 'Large Landslide'),
    'very_large': ('orange', 'Very Large Landslide'),
    'catastrophic': ('red', 'Catastrophic Landslide'),
}
# Any other value (including a missing size) is drawn as "unknown".
unknown_style = ('purple', 'Landslide of Unknown Size')

# Rows without coordinates cannot be drawn; drop them explicitly instead of
# relying on an exception being swallowed for each of them.
plottable = map_df.dropna(subset=['latitude', 'longitude'])

# itertuples yields values in column order: size, longitude, latitude.
for sizelandslide, longitude, latitude in plottable.itertuples(index=False, name=None):
    color, tooltip_text = size_styles.get(sizelandslide, unknown_style)
    try:
        circle = folium.Circle(location=[latitude, longitude],
                               tooltip=tooltip_text,
                               radius=50,
                               color=color,
                               fill=True,
                               fill_color=color,
                               fill_opacity=0.5)
    except (TypeError, ValueError):
        # Best effort: skip a record whose coordinates are malformed, but let
        # unrelated errors (and KeyboardInterrupt) surface.
        continue
    circle.add_to(my_map)
my_map
Make this Notebook Trusted to load map: File -> Trust Notebook

The geographic distribution of landslides depicted on the map highlights the prevalence of landslide events across the globe. From the visualization, it is evident that landslides vary in size, with small to catastrophic events occurring in different regions. The database used shows a trend of landslides around the equator. This may be because these areas see greater amounts of rain - a common trigger of landslides. However, it should be noted that the existing literature on landslide-climate studies reveals a geographic bias, with large parts of the world - such as Asia, South America, and Africa - remaining understudied. Thus, this visualization may be missing some relevant data. Regardless, this map containing preliminary information is crucial for beginning to understand the role of landscape and location in determining landslide risk.

Scatterplot of Global Landslides in Ten-Year Timeframe

This scatterplot visualizes the occurrence of landslides globally over a ten-year period (2007-2016), categorized by month and year. Each point represents the number of landslides recorded for a specific month and year, with the size of the point indicating the relative frequency of landslides. The color gradient also represents the frequency of landslides with darker colors indicating lower frequencies.

# Create columns for year and month
# NOTE: this mutates df in place; the later charts rely on 'Year' and 'Month'.
df['event_date'] = pd.to_datetime(df['event_date'], format='%m/%d/%Y %I:%M:%S %p')
df['Year'] = df['event_date'].dt.year
df['Month'] = df['event_date'].dt.month

# Save a copy of data before omitting anything
df1 = df.copy()

# Edit data frame to just capture year and month
dfscatter = df[['Year', 'Month']]

# Group data by year and month, create counts (one row per year/month pair)
w = dfscatter.groupby(['Year', 'Month'])['Year'].count().reset_index(name='count')

# There were 12 counts for the years 2007-2016, meaning there was data populated for each month. Although it is certainly possible there were fewer landslides in years earlier, this ten year timeframe seemed to have the most robust data thus I decided to focus on it for the visualization
# Take an explicit copy: adding a column to a bare .loc slice of w raises
# SettingWithCopyWarning and is not guaranteed to modify the intended frame.
w2 = w.loc[w['Year'].isin(range(2007, 2017))].copy()

# When I initially ran the data for the scatterplot, my points on the graph were quite small and it was difficult to compare relative size
# In order to combat this, I decided to scale my data by multiplying my counts by 10
w2['count_tens'] = (w2['count'] * 10).round(0)

# Reset the index (should leave 10 years * 12 months = 120 rows)
w2 = w2.reset_index(drop=True)

# Create the scatterplot: one marker per (month, year); marker area and
# color both encode the scaled landslide count.
plt.figure(figsize=(18, 15))
plt.scatter(
    w2['Month'],
    w2['Year'],
    s=w2['count_tens'],
    c=w2['count_tens'],
    cmap='viridis',
    marker='o',
    edgecolors='white',
)
plt.title('Global Landslides in Ten-Year Timeframe (by Month)', fontsize=18)
plt.xlabel('Month', fontsize=16)
plt.ylabel('Year', fontsize=16)

cbar = plt.colorbar()
cbar.set_label('Number of Recorded Landslides', rotation=270, fontsize=16, color='black', labelpad=35)

# Colorbar ticks sit on the scaled (x10) values but are labelled with the
# unscaled counts, so the legend reads in real landslide numbers.
my_colorbar_ticks = list(range(100, int(w2['count_tens'].max()), 100))
cbar.set_ticks(my_colorbar_ticks)

my_colorbar_tick_labels = list(range(10, int(w2['count'].max()), 10))
cbar.set_ticklabels(my_colorbar_tick_labels)

# One tick per month on the x-axis and one per year on the y-axis.
my_x_ticks = list(range(w2['Month'].min(), w2['Month'].max() + 1))
plt.xticks(my_x_ticks, fontsize=14)

my_y_ticks = list(range(w2['Year'].min(), w2['Year'].max() + 1))
plt.yticks(my_y_ticks, fontsize=14)

plt.show()

The plot above focuses primarily on a specific subset of the dataset, specifically the ten-year period from 2007 through 2016, as it contains the most comprehensive information. Although the previous map showed more data from the original dataset, this analysis will focus on this particular time range due to the higher concentration of records within these years. This approach ensures that the analysis is based on the most robust and representative data available. The scatterplot shows that the number of landslides varies by year and month without clear patterns of higher/lower frequency. It should be noted that the years 2007 - 2009 have generally lower frequencies of landslides. There are several potential reasons for this including: climate variability, land use and development (where changes in land use practices, such as deforestation, urbanization, or construction activities, could have influenced landslide susceptibility), natural disaster events (where differences in the number of major natural disasters such as earthquakes or hurricanes - which can trigger landslides directly or indirectly through associated factors like heavy rainfall or ground shaking - may have influenced landslide counts), or simply data collection and reporting (where variations in data collection methods, reporting standards, or the availability of monitoring systems over time could have affected the recorded number of landslides).

Climate variability, though, appears crucial in examining this discrepancy. Although there have been steady increases in global temperature since around 1920, NASA reported in 2012 that 2011 was the ninth warmest year on record (NASA Finds 2011 Ninth-warmest Year on Record). Such drastic increases in temperature led to predictions of more severe weather even then. Thus, climate variability appears to correlate with the uptick in global landslides from 2007-2009 to 2010-2016 seen in the scatterplot.

Frequency of Landslides by Country (2007-2016)

Combining elements of the information analyzed thus far, these bar charts depict the frequency of landslides recorded in various countries over a ten-year period (2007-2016). The top and bottom panels of the chart display the 10 countries with the highest and subsequent 50 highest landslide counts, respectively. The bars are color-coded based on whether the landslide frequency is above, below, or within 10% of the average frequency.

##Bar chart
# Limit data used to between 2007-2016 for 10 year timeframe
dfbar = df[(df['Year'] >= 2007) & (df['Year'] <= 2016)]

# Check all columns in df
print(dfbar.columns)
# Limit data frame to relevant data - country names
dfbar = dfbar[['country_name']]
print(dfbar)

# Drop rows with NAs - counting and adding a column to show the NAs did not feel applicable to landslide count
dfbar = dfbar.dropna()

# Create df with the number of landslides by country.
# Count from dfbar (the 2007-2016 subset), not the full df, so the figures
# match the timeframe stated in the chart title.
x = dfbar.groupby(['country_name'])['country_name'].count().reset_index(name='count')

# Order df in descending order and renumber rows so that the row position is
# the country's rank (0 = most landslides).
x = x.sort_values('count', ascending=False).reset_index(drop=True)

# Create rule for coloring data based on the average
def pick_colors_according_to_mean_count(this_data):
    """Return one color per entry of ``this_data['count']``.

    Each value is compared with the mean of the column:
    more than 10% above the mean -> '#59eb91',
    more than 10% below the mean -> '#40E0D0',
    anything else -> 'gold'.
    The returned list follows the iteration order of the column.
    """
    mean_count = this_data['count'].mean()
    upper_bound = mean_count * 1.1
    lower_bound = mean_count * 0.9

    def color_for(value):
        # Guard clauses in the same order as the thresholds are documented.
        if value > upper_bound:
            return '#59eb91'
        if value < lower_bound:
            return '#40E0D0'
        return 'gold'

    return [color_for(value) for value in this_data['count']]

# Create bar charts


# Define function to format y-axis labels with commas
def format_with_commas(x, pos):
    """Format tick value ``x`` with thousands separators and no decimals.

    ``pos`` is the tick position supplied by ``FuncFormatter``; it is unused.
    """
    return '{:,.0f}'.format(x)


# Slice the ranked table by position. iloc is half-open, so these are exactly
# the top 10 rows and the following 50 rows (label-based .loc[0:10] would
# include row 10 and yield 11 countries in the "Top 10" panel).
bottom2 = 0
top2 = 10
d3 = x.iloc[bottom2:top2]
my_colors2 = pick_colors_according_to_mean_count(d3)

bottom1 = top2
top1 = 60
d2 = x.iloc[bottom1:top1]
my_colors1 = pick_colors_according_to_mean_count(d2)

# Legend entries shared by both panels.
Above = mpatches.Patch(color='#59eb91', label='Above Average')
At = mpatches.Patch(color='gold', label='Within 10% of the Average')
Below = mpatches.Patch(color='#40E0D0', label='Below Average')

fig = plt.figure(figsize=(28, 26))
fig.suptitle('Frequency of Landslides by Country (2007-2016):\n Top 60 Countries with Highest Landslide Count', fontsize=30, fontweight='bold')

# --- Upper panel: the top 10 countries ---
ax2 = fig.add_subplot(2, 1, 1)
ax2.bar(d3.country_name, d3['count'], label='Count', color=my_colors2)
ax2.legend(handles=[Above, At, Below], fontsize=20)
mean_value_rounded = round(d3['count'].mean(), 2)
ax2.axhline(mean_value_rounded, color='black', linestyle='dashed')
ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.tick_params(axis='x', labelsize=20, rotation=45)
ax2.tick_params(axis='y', labelsize=20)
# Create Y-axis tick formatter for subplot as some y-axis labels >999
ax2.yaxis.set_major_formatter(ticker.FuncFormatter(format_with_commas))
ax2.set_title('Top ' + str(top2) + ' Countries with Highest Landslide Count', size=30)
# Anchor the mean label just left of the last bar, slightly above the line.
ax2.text(len(d3) - 1.3, mean_value_rounded + 50, 'Mean = ' + str(mean_value_rounded), rotation=0, fontsize=20)

# --- Lower panel: the following 50 countries ---
ax1 = fig.add_subplot(2, 1, 2)
ax1.bar(d2.country_name, d2['count'], label='Count', color=my_colors1)
ax1.legend(handles=[Above, At, Below], fontsize=20)
# Round the mean as in the upper panel so the label does not print a long float.
mean_value_rounded_next = round(d2['count'].mean(), 2)
ax1.axhline(mean_value_rounded_next, color='black', linestyle='dashed')
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.tick_params(axis='x', labelsize=20, rotation=90)
ax1.tick_params(axis='y', labelsize=20)
ax1.set_title('Following Top ' + str(top1 - top2) + ' Countries with Highest Landslide Count', size=30)
ax1.text(len(d2) - 3, mean_value_rounded_next + 2, 'Mean = ' + str(mean_value_rounded_next), rotation=0, fontsize=20)

fig.subplots_adjust(hspace=0.5)
# plt.show must be called; the bare name is a no-op expression.
plt.show()

The bar charts indicate (supported by the map depicting the geographical distribution of landslides) that the majority of recorded landslides (from 2007-2016) occurred within the U.S., with nearly 3,000 landslides on record. Following the U.S. are India and the Philippines. Upon initial observation, the high concentration of landslides in the U.S. may be surprising. The higher frequency, however, may be attributable to several factors. Firstly, again, the geographical bias in reported landslides may under-represent certain areas prone to landslides. This, coupled with the perhaps inherent bias of the database source, NASA (a U.S. government agency), may also imply a more robust data subset for the U.S. Secondly, the geography of certain parts of the U.S. may also make it more susceptible to landslides. FEMA reported that in fact: “the Appalachian Mountains, the Rocky Mountains, and the Pacific Coastal Ranges and some parts of Alaska and Hawaii have severe landslide problems” (FEMA). Thus, the high frequency of landslides in the U.S. seems founded, even if relative rank is skewed by the geographic reporting bias.

The following bar chart depicts the next 50 countries with the highest landslide count in the ten-year time frame. The average among this subset is about 42 recorded landslides per country in the time frame. There are 16 countries with recorded counts exceeding the average of 42 recorded landslides, 5 within 10% of this average, and 29 below the reported average. Thus, the spread of recorded landslides is varied. High frequencies of landslides can pose threats to a country’s infrastructure, economy, and people. Understanding the regions where landslides tend to occur is critical in preparing these regions for the impact landslides may have.

Impact of Landslides

Global Landslide Fatalities

Total Landslide Fatalities by Month (2007-2016)

This line chart illustrates the total number of fatalities resulting from landslides recorded each month over a ten-year period (2007-2016). The chart displays trends in landslide-related fatalities, with each line representing a specific year and color-coded accordingly.

# Create df for line chart: total fatalities per (year, month)
fatalities_df = df.groupby(['Year', 'Month'])['fatality_count'].sum().reset_index(name='TotalFatalities')

# Limit the df to years 2007-2016
fatalities_df = fatalities_df[(fatalities_df['Year'] >= 2007) & (fatalities_df['Year'] <= 2016)]
fatalities_df = fatalities_df.reset_index(drop=True)

# Create line chart
fig = plt.figure(figsize=(20, 12))
ax = fig.add_subplot(1, 1, 1)
# Line color for each year, keyed by the year as a string.
my_colors = {'2007': 'red',
             '2008': 'cyan',
             '2009': 'gray',
             '2010': 'green',
             '2011': 'blue',
             '2012': 'purple',
             '2013': 'orange',
             '2014': 'brown',
             '2015': 'gold',
             '2016': 'pink'}
# Group by the bare column name so each key is a scalar year. Grouping by the
# one-element list ['Year'] yields 1-tuple keys on pandas 2.x, which makes the
# color lookup fail with KeyError.
for key, grp in fatalities_df.groupby('Year'):
    # int() also normalises a float year such as 2007.0 before the lookup.
    year_label = str(int(key))
    grp.plot(ax=ax, kind='line', x='Month', y='TotalFatalities',
             color=my_colors[year_label], label=year_label, marker='8')
plt.title('Total Landslide Fatalities by Month', fontsize=23)
ax.set_xlabel('Month', fontsize=19)
ax.set_ylabel('Total Fatalities', fontsize=19, labelpad=20)
ax.tick_params(axis='x', labelsize=18, rotation=0)
ax.tick_params(axis='y', labelsize=18, rotation=0)
# Thousands separators on the fatality axis.
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))

# One tick per calendar month (1-12); there is no month 0 to label.
ax.set_xticks(np.arange(1, 13))

ax.legend(fontsize=19)

plt.show()

Fatalities - among other impacts - should certainly be considered in the investigation of the impact of landslides on a global scale. The line graph shows the number of fatalities resulting from landslides categorized by month and year. Clearly, in June of 2013, there is a spike in recorded fatalities. Further research shows that monsoon rains coupled with certain conditions produced deadly landslides in Uttarakhand, India in June 2013. As the map shows, the Himalayan region of India tends to see a greater concentration of landslides - likely due to the monsoons and areas composed of steep slopes (both factors commonly associated with landslides). However, the American Meteorological Society reports that conditions in this particular year and month “differed from conditions that produced other notorious floods in the Himalayan region in recent years.” They note: “During the week preceding… deep convection moistened the mountainsides, making them vulnerable to flooding. However, the precipitation producing the flood was not associated with a deep convective event. Rather, an eastward-propagating upper-level trough in the westerlies extended abnormally far southward, with the jet reaching the Himalayas. The south end of the trough merged with a monsoon low moving westward across India. The merged system produced persistent moist low-level flow oriented normal to the Himalayas that advected large amounts of water vapor into the Uttarakhand region. The flow was moist neutral when it passed over the Himalayan barrier, and orographic lifting produced heavy continuous rain over the region for 2–3 days” (Multiscale Aspects of the Storm Producing the June 2013 Flooding in Uttarakhand, India).

It was these abnormal conditions and large landslides that unfortunately resulted in the loss of nearly 5,000 individuals. Initial estimates at the time predicted massive losses - as seen in the CNN news article from Sunday June 23rd, 2013 - although these preliminary predictions ultimately severely underestimated the death total (estimated around 5,000).

There are two other fatality counts that are relatively higher than the majority of the data: one in May 2014 and the other in August 2010. Interestingly, all three of the highest death tolls occur in spring and summer months, perhaps indicating a pattern in seasonal weather patterns that may associate with greater risk of landslides. Although it may not account for all recorded landslides in May of 2014, on May 18th, 2014 there was a deadly landslide in Colombia that made global headlines (CNN: Colombia landslide kills dozens). In August 2010, two extreme weather events occurred in the northeast and northwest regions of China. First, in the northeast region of Jilin, between August 1st and 4th, there was severe flooding and rain, killing nearly 100 people - who may be recorded in the landslide fatality data. More likely, though, the high fatality count reported in August of 2010 resulted from devastating landslides that followed these rainstorms in the northwestern province of Gansu. NASA reported “in Zhouqu county, several villages were leveled by landslides triggered by the rainfall”, ultimately resulting in at least 1,435 people reported killed in the landslides and several hundred reported missing (who may have later been presumed dead, nearing the total 2,000 fatalities seen in the figure) (NASA: August 2010 Global Hazards).

Total Fatality Count by Landslide Size (2007-2016)

This donut chart presents the distribution of landslide fatalities by landslide size categories over a ten-year period (2007-2016). Each segment of the donut chart represents a landslide size category, with the size of the segment proportional to the total number of fatalities associated with that category.

## Donut chart

# Building df: total fatalities per landslide size, limited to the 2007-2016
# window that the chart title reports ('Year' is derived from event_date
# earlier in the file).
in_timeframe = (df['Year'] >= 2007) & (df['Year'] <= 2016)
pie_df = (df[in_timeframe]
          .groupby(['landslide_size'])['fatality_count']
          .sum()
          .reset_index(name='TotalFatalities'))

custom_order = ['catastrophic', 'large', 'unknown', 'medium', 'small', 'very_large']

# Convert 'landslide_size' to categorical data with custom order
pie_df['landslide_size'] = pd.Categorical(pie_df['landslide_size'], categories=custom_order, ordered=True)

# Sort the DataFrame based on the categorical order, then renumber the rows
pie_df = pie_df.sort_values(by='landslide_size').reset_index(drop=True)

# Pick every second entry of the colormap, one per size category
number_outside_colors = len(pie_df.landslide_size.unique())
outside_color_ref_number = np.arange(number_outside_colors) * 2
print(outside_color_ref_number)

# Calculate total fatalities
all_fatalities = pie_df.TotalFatalities.sum()

# Calculate total fatalities from unknown landslide size.
# This must happen BEFORE the labels are prettified below: afterwards the
# category reads 'Unknown' and a comparison with 'unknown' matches nothing.
unknown_sum = pie_df[pie_df['landslide_size'] == 'unknown']['TotalFatalities']
unknown_sum_int = int(unknown_sum.sum())

# Create labels for pie chart
label_mapping = {'small': 'Small', 'unknown': 'Unknown', 'catastrophic': 'Catastrophic', 'medium': 'Medium', 'large': 'Large', 'very_large': 'Very Large'}

# Apply the label mapping to the 'landslide_size' column
pie_df['landslide_size'] = pie_df['landslide_size'].map(label_mapping)

# Create pie chart
fig = plt.figure(figsize=(13, 13))
ax = fig.add_subplot(1, 1, 1)

colormap = plt.get_cmap("tab20_r")
outer_colors = colormap(outside_color_ref_number)

# Building the pie chart; each wedge is annotated with its share and the
# fatality count recovered from that share.
# observed=False keeps every category in custom_order as a wedge.
pie_df.groupby(['landslide_size'], observed=False)['TotalFatalities'].sum().plot(
    ax=ax,
    kind='pie', radius=1, colors=outer_colors, pctdistance=0.7, labeldistance=1.07,
    wedgeprops=dict(edgecolor='white'), textprops={'fontsize': 16},
    autopct=lambda p: '{:.2f}%\n({:,.0f} Fatalities)'.format(p, (p / 100) * all_fatalities),
    startangle=90)

# Add hole in the middle
hole = plt.Circle((0, 0), 0.3, fc='white')
ax.add_artist(hole)

ax.axis('equal')
plt.tight_layout()

# Convert total fatalities to an integer (int() truncates any fraction)
total_fatalities_int = int(all_fatalities)

# Add comma to the total fatalities value
total_fatalities_formatted = '{:,.0f}'.format(total_fatalities_int)

# Add information to the center of the pie chart, using the computed
# unknown-size subtotal rather than a hard-coded figure
ax.text(0, 0,
        'Total Fatalities: ' + str(total_fatalities_formatted)
        + '\n(including ' + '{:,}'.format(unknown_sum_int) + ' recorded '
        + '\nfatalities from landslides'
        + '\nof unknown size)',
        size=14, ha='center', va='center')

# Title formatting
ax.yaxis.set_visible(False)
plt.title('Total Fatality Count by Landslide Size (2007 - 2016)', fontsize=18, ha='center', va='center')
plt.subplots_adjust(top=0.9)

plt.show()

Conclusion

We are now done with charts. Here are some general takeaways from my output.

You can add captions at the bottom of images. To add a caption, include the words fig.cap=“blah blah” inside the {….} at the top of the RMarkdown code you are using to include the image.