In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
data= pd.read_csv('Downloads\\master.csv')
#datanew= pd.read_csv('C:\\Users\\alvir\\Downloads\\master_wb.csv')
In C:\Users\alvir\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\alvir\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\alvir\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In C:\Users\alvir\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\alvir\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The savefig.jpeg_quality rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\alvir\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The keymap.all_axes rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\alvir\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The animation.avconv_path rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\alvir\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The animation.avconv_args rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.

Looking at Data

In [2]:
data.columns.values
Out[2]:
array(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides/100k pop', 'country-year', 'HDI for year',
       ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation'],
      dtype=object)
In [3]:
#datanew.columns.values
In [4]:
data.shape
Out[4]:
(27820, 12)
In [5]:
data.head()
Out[5]:
country year sex age suicides_no population suicides/100k pop country-year HDI for year gdp_for_year ($) gdp_per_capita ($) generation
0 Albania 1987 male 15-24 years 21 312900 6.71 Albania1987 NaN 2,156,624,900 796 Generation X
1 Albania 1987 male 35-54 years 16 308000 5.19 Albania1987 NaN 2,156,624,900 796 Silent
2 Albania 1987 female 15-24 years 14 289700 4.83 Albania1987 NaN 2,156,624,900 796 Generation X
3 Albania 1987 male 75+ years 1 21800 4.59 Albania1987 NaN 2,156,624,900 796 G.I. Generation
4 Albania 1987 male 25-34 years 9 274300 3.28 Albania1987 NaN 2,156,624,900 796 Boomers
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 12 columns):
country               27820 non-null object
year                  27820 non-null int64
sex                   27820 non-null object
age                   27820 non-null object
suicides_no           27820 non-null int64
population            27820 non-null int64
suicides/100k pop     27820 non-null float64
country-year          27820 non-null object
HDI for year          8364 non-null float64
 gdp_for_year ($)     27820 non-null object
gdp_per_capita ($)    27820 non-null int64
generation            27820 non-null object
dtypes: float64(2), int64(4), object(6)
memory usage: 2.5+ MB
In [7]:
Numerical=['year', 'suicides_no', 'population', 'suicides/100k pop','HDI for year', 'gdp_for_year ($)', 'gdp_per_capita ($)']
Categorical =['country', 'sex', 'age', 'generation', 'country-year'] 
 
In [8]:
contries= data['country'].unique()
print('Countries count:', len(contries))
Countries count: 101
In [9]:
age= data['age'].unique()
age
Out[9]:
array(['15-24 years', '35-54 years', '75+ years', '25-34 years',
       '55-74 years', '5-14 years'], dtype=object)
In [10]:
age= data['generation'].unique()
age
Out[10]:
array(['Generation X', 'Silent', 'G.I. Generation', 'Boomers',
       'Millenials', 'Generation Z'], dtype=object)

Plotting

In [ ]:
 
In [11]:
total_suicides= pd.DataFrame(data.groupby(['age', 'sex'])['suicides/100k pop'].sum().unstack() )
total= pd.DataFrame(total_suicides.sum()).reset_index()
total
Out[11]:
sex 0
0 female 75014.77
1 male 281529.06
In [12]:
Female=data[data['sex']=='female'] 
Male=data[data['sex']=='male'] 
#print('Total Male:', Male['suicides_no'].sum(), 'while for women is:', Female['suicides_no'].sum())
total= Male['suicides/100k pop'].sum() + Female['suicides/100k pop'].sum()
Male_totals= Male['suicides/100k pop'].sum()/total
Female_totals= Female['suicides/100k pop'].sum()/total
labels = 'Female', 'Male'
sizes = [Female_totals, Male_totals]
colors = ['lightcoral', 'lightskyblue']
explode = (0.1, 0) 

# Plot
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
autopct='%1.1f%%', shadow=True, startangle=140)

plt.axis('equal')
plt.show()
print('Male suicides are:', Male_totals *100, '% more than Female suicides')
print('Male suicide rate is higher than female')
Male suicides are: 78.96057547819576 % more than Female suicides
Male suicide rate is higher than female
In [13]:
age= data.groupby(['sex']).sum().reset_index()

plt.figure(figsize=(8,4))
plt.title('Number of Sucidies per age group and gender from 1985 to 2016')
sns.barplot(x='sex', y='suicides/100k pop', hue='sex', data=age, palette='vlag')
plt.ylabel('Suicides ')
plt.xlabel('sex')
plt.show
print('Number of Suicide is higher for Male than Female')
Number of Suicide is higher for Male than Female
In [ ]:
 
In [14]:
age= data.groupby(['age','sex']).mean().reset_index()

plt.figure(figsize=(8,4))
plt.title('Number of Sucidies over Age Group and Sex')
sns.barplot(x='age', y='suicides/100k pop', hue='sex', data=age, palette='deep')
plt.ylabel('Suicides per 100k')
plt.xlabel('Age Group & Sex')
plt.show
print('Age groups between 35-54 years has a higher suicide number')
Age groups between 35-54 years has a higher suicide number
In [15]:
countries= data.groupby('country').mean().reset_index()
In [16]:
countries
Out[16]:
country year suicides_no population suicides/100k pop HDI for year gdp_per_capita ($)
0 Albania 1999.227273 7.462121 2.360813e+05 3.502879 0.673000 1859.045455
1 Antigua and Barbuda 1999.481481 0.033951 6.142679e+03 0.552901 0.781667 10448.185185
2 Argentina 2000.000000 221.018817 2.784907e+06 10.469328 0.779600 7914.096774
3 Armenia 2002.791946 6.392617 2.595576e+05 3.275872 0.690125 1873.919463
4 Aruba 2004.285714 0.601190 7.498077e+03 9.503095 NaN 24221.642857
... ... ... ... ... ... ... ...
96 United Arab Emirates 2007.500000 8.638889 5.069760e+05 1.317917 0.825000 42162.000000
97 United Kingdom 2000.000000 367.755376 4.674107e+06 7.502473 0.863500 31908.354839
98 United States 2000.000000 2779.604839 2.165061e+07 13.819812 0.891600 39269.612903
99 Uruguay 2000.214286 39.101190 2.502052e+05 19.461190 0.746556 7622.071429
100 Uzbekistan 2001.318182 131.829545 1.842510e+06 8.099129 0.650000 976.181818

101 rows × 7 columns

In [17]:
countries= data.groupby('country').mean().reset_index()

plt.figure(figsize=(20,10))
plt.title('Top 5 Countries with most Suicicdes Per 100k Population over years 1987-2015')
top= countries.sort_values(by='suicides/100k pop', ascending=False)[:5]
sns.barplot(x='country', y='suicides/100k pop', data=top, palette='bright')
plt.ylabel('Suicides Per 100k ')
plt.xlabel('Countries')
plt.show
print('Russian Federation has a higher suicide number')
Russian Federation has a higher suicide number
In [18]:
# Generation wise comparision of suicide rate using bar graph.
age= data.groupby(['age','sex']).mean().reset_index()

generation_plot = sns.barplot(x='sex', y='suicides_no', hue='age', data=age)
plt.title("Age wise comparision of suicide rate",fontsize=12)
plt.xlabel("Sex",fontsize=13)
plt.ylabel("Suicide Count",fontsize=13)
plt.show()
In [ ]:
 
In [19]:
age= data['year'].unique()
age
Out[19]:
array([1987, 1988, 1989, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
       2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       1985, 1986, 1990, 1991, 2012, 2013, 2014, 2015, 2011, 2016],
      dtype=int64)
In [ ]:
 
In [20]:
total_suicides= pd.DataFrame(data['gdp_per_capita ($)'].groupby( data['country']).mean().reset_index())
ploting = total_suicides.sort_values(by=['gdp_per_capita ($)','country'], ascending=False)
plt.figure(figsize=(14,18))
plt.title('Countries by GDP Per Capita')
sns.barplot(x='gdp_per_capita ($)', y='country', data=ploting, palette='dark')
plt.ylabel('Countries')
plt.xlabel('GDP Per Capita ($)')

plt.show
Out[20]:
<function matplotlib.pyplot.show(close=None, block=None)>
In [21]:
data['year'].unique()
Out[21]:
array([1987, 1988, 1989, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
       2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       1985, 1986, 1990, 1991, 2012, 2013, 2014, 2015, 2011, 2016],
      dtype=int64)
In [22]:
#Plot by year (line)
gsd_year=pd.DataFrame(data.groupby(['year','country'])['suicides/100k pop'].mean().unstack())
gsd_year['Suicide'] = gsd_year.sum(axis=1)

gsd_year.loc[:,'Suicide'].plot(kind='line',figsize=(10,6),marker='o')
plt.title('Suicide Trend Over Year')
plt.xlabel('Year(s)')
plt.ylabel('Suicids per 100K')
plt.grid()
plt.show()
In [23]:
#Trend over time by Sex 
In [24]:
#Plot by year (line)
years=pd.DataFrame(data.groupby(['year','country'])['gdp_per_capita ($)'].mean().unstack())

years['Suicides'] = years.mean(axis=1)

years.loc[:,'Suicides'].plot(kind='line',figsize=(10,6),marker='o')
plt.title('GDP Per Capita Trend Over Years')
plt.xlabel('Year(s)')
plt.ylabel('GDP Per Capita')
plt.grid()

plt.show()
In [ ]:
 
In [25]:
print("Minimum Suicides per 100k Population: {}".format(data['suicides/100k pop'].min()))
print("Maximum Suicides per 100k Population: {}".format(data['suicides/100k pop'].max()))
print("Average Suicides per 100k Population: {}".format(data['suicides/100k pop'].mean()))
Minimum Suicides per 100k Population: 0.0
Maximum Suicides per 100k Population: 224.97
Average Suicides per 100k Population: 12.816097411933894
In [26]:
plt.figure(figsize=(7,5))
sns.heatmap(data.corr(),annot = True)
plt.title("Figure4: Correlation",fontsize=12)
plt.show()
In [27]:
# Number of countries contributed each year.
def count_country(group):
    return len(group.country.unique())
country_no = data.groupby('year').apply(count_country)
country_no.name = 'number_of_countries'
country_no
Out[27]:
year
1985    48
1986    48
1987    54
1988    49
1989    52
1990    64
1991    64
1992    65
1993    65
1994    68
1995    78
1996    77
1997    77
1998    79
1999    83
2000    86
2001    88
2002    86
2003    86
2004    84
2005    84
2006    85
2007    86
2008    85
2009    89
2010    88
2011    86
2012    81
2013    80
2014    78
2015    62
2016    16
Name: number_of_countries, dtype: int64
In [28]:
# Create a new data frame with feature "number_of_countries" which contributed data for each year.
yr_grouped = data.groupby('year')
annual_pop = yr_grouped.population.sum()
annual_suicides = yr_grouped.suicides_no.sum()
country_df = pd.concat([country_no,annual_suicides,annual_pop],axis = 1)
country_df.head()
Out[28]:
number_of_countries suicides_no population
year
1985 48 116063 1008600086
1986 48 120670 1029909613
1987 54 126842 1095029726
1988 49 121026 1054094424
1989 52 160244 1225514347
In [ ]:
 
In [29]:
# gdp_per_captia with respect to each country using bar plot.
data_gdp = data[['country','gdp_per_capita ($)']].groupby(['country']).sum()
data_gdp.plot(kind='bar', figsize=(40,10), fontsize=25)
plt.title("Figure7: Country wise comparision of GDP per capita",fontsize=40)
plt.xlabel("Country",fontsize=35)
plt.ylabel("Suicide Count",fontsize=40)
plt.show()