Learning Python is crucial for any aspiring data science practitioner. Learn to visualize real data with Matplotlib's functions and get acquainted with data structures such as the dictionary and the pandas DataFrame. After covering key concepts such as boolean logic, control flow, and loops in Python, you'll be ready to blend together everything you've learned to solve a case study using hacker statistics.
Loading Data
import pandas as pd
import numpy as np
# Read the Gapminder CSV (path is relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='gapminder.csv')
# Preview the first five rows
data.head(n=5)
| | Unnamed: 0 | country | year | population | cont | life_exp | gdp_cap |
---|---|---|---|---|---|---|---|
0 | 11 | Afghanistan | 2007 | 31889923.0 | Asia | 43.828 | 974.580338 |
1 | 23 | Albania | 2007 | 3600523.0 | Europe | 76.423 | 5937.029526 |
2 | 35 | Algeria | 2007 | 33333216.0 | Africa | 72.301 | 6223.367465 |
3 | 47 | Angola | 2007 | 12420476.0 | Africa | 42.731 | 4797.231267 |
4 | 59 | Argentina | 2007 | 40301927.0 | Americas | 75.320 | 12779.379640 |
# Select the life-expectancy column as a Series
data.loc[:, 'life_exp']
0 43.828 1 76.423 2 72.301 3 42.731 4 75.320 ... 137 74.249 138 73.422 139 62.698 140 42.384 141 43.487 Name: life_exp, Length: 142, dtype: float64
Line plot - GDP vs Life Expectancy
import matplotlib.pyplot as plt
# Line plot: GDP per capita along the x-axis, life expectancy along the y-axis
gdp_per_capita = data['gdp_cap']
life_expectancy = data['life_exp']
plt.plot(gdp_per_capita, life_expectancy)
# Render the figure
plt.show()
Scatter Plot - GDP vs Life Expectancy
# Same two columns as individual points instead of a connected line
plt.scatter(x=data['gdp_cap'], y=data['life_exp'])
# Use a logarithmic scale on the x-axis
plt.xscale('log')
# Render the figure
plt.show()
Line Plot - Population vs Life Expectancy
# Line plot (plt.plot, not a scatter): population on x, life expectancy on y
population = data['population']
life_expectancy = data['life_exp']
plt.plot(population, life_expectancy)
# Render the figure
plt.show()
Scatter Plot - Population vs Life Expectancy
# Scatter plot: population on x, life expectancy on y
plt.scatter(x=data['population'], y=data['life_exp'])
# Render the figure
plt.show()
Histogram
# Histogram of life expectancy with matplotlib's default bin count
plt.hist(data['life_exp'])
plt.show()

# Repeat with a coarse (5) and a fine (20) binning; each figure is shown
# and then cleared before the next one is drawn
for n_bins in (5, 20):
    plt.hist(data['life_exp'], bins=n_bins)
    plt.show()
    plt.clf()
<Figure size 432x288 with 0 Axes>
Life Expectancy in 2007 vs Life Expectancy in 1950
# Hard-coded sample of 142 life-expectancy values used for the 1950 histogram below.
# NOTE(review): presumably ordered like the rows of `data` — confirm against the source of these numbers.
life_exp1950=np.array([28.8 , 55.23, 43.08, 30.02, 62.48, 69.12, 66.8 , 50.94, 37.48,
68. , 38.22, 40.41, 53.82, 47.62, 50.92, 59.6 , 31.98, 39.03,
39.42, 38.52, 68.75, 35.46, 38.09, 54.74, 44. , 50.64, 40.72,
39.14, 42.11, 57.21, 40.48, 61.21, 59.42, 66.87, 70.78, 34.81,
45.93, 48.36, 41.89, 45.26, 34.48, 35.93, 34.08, 66.55, 67.41,
37. , 30. , 67.5 , 43.15, 65.86, 42.02, 33.61, 32.5 , 37.58,
41.91, 60.96, 64.03, 72.49, 37.37, 37.47, 44.87, 45.32, 66.91,
65.39, 65.94, 58.53, 63.03, 43.16, 42.27, 50.06, 47.45, 55.56,
55.93, 42.14, 38.48, 42.72, 36.68, 36.26, 48.46, 33.68, 40.54,
50.99, 50.79, 42.24, 59.16, 42.87, 31.29, 36.32, 41.72, 36.16,
72.13, 69.39, 42.31, 37.44, 36.32, 72.67, 37.58, 43.44, 55.19,
62.65, 43.9 , 47.75, 61.31, 59.82, 64.28, 52.72, 61.05, 40. ,
46.47, 39.88, 37.28, 58. , 30.33, 60.4 , 64.36, 65.57, 32.98,
45.01, 64.94, 57.59, 38.64, 41.41, 71.86, 69.62, 45.88, 58.5 ,
41.22, 50.85, 38.6 , 59.1 , 44.6 , 43.58, 39.98, 69.18, 68.44,
66.07, 55.09, 40.41, 43.16, 32.55, 42.04, 48.45])
# Draw a 15-bin histogram for each sample in turn: first the life_exp
# column of the DataFrame, then the life_exp1950 array. Each figure is
# shown and cleared before the next one.
for sample in (data['life_exp'], life_exp1950):
    plt.hist(sample, bins=15)
    plt.show()
    plt.clf()
<Figure size 432x288 with 0 Axes>
Labels
# Text used for the axis labels and the title
xlab = 'GDP per Capita [in USD]'
ylab = 'Life Expectancy [in years]'
title = 'World Development in 2007'

# Scatter plot with a logarithmic x-axis
plt.scatter(x=data['gdp_cap'], y=data['life_exp'])
plt.xscale('log')

# Attach the axis labels and the title
plt.xlabel(xlab)
plt.ylabel(ylab)
plt.title(title)

# Render the customized figure
plt.show()
Ticks
# Tick positions on the x-axis and the short labels shown at them
tick_val = [1000, 10000, 100000]
tick_lab = ['1k', '10k', '100k']

# Scatter plot with the customizations from the previous step
plt.scatter(x=data['gdp_cap'], y=data['life_exp'])
plt.xscale('log')
plt.xlabel('GDP per Capita [in USD]')
plt.ylabel('Life Expectancy [in years]')
plt.title('World Development in 2007')

# Replace the default x-axis ticks with the labelled ones
plt.xticks(tick_val, tick_lab)

# Render the customized figure
plt.show()
Sizes
# Import numpy as np
import numpy as np
# Population in millions (kept as `pop` because later cells reuse it)
pop = data['population'] / 1000000
# Marker sizes: the population in millions as a numpy array, doubled
np_pop = 2 * np.array(pop)

# Bubble chart: marker size (s) follows np_pop
plt.scatter(x=data['gdp_cap'], y=data['life_exp'], s=np_pop)

# Customizations carried over from the previous plots
plt.xscale('log')
plt.xlabel('GDP per Capita [in USD]')
plt.ylabel('Life Expectancy [in years]')
plt.title('World Development in 2007')
plt.xticks([1000, 10000, 100000], ['1k', '10k', '100k'])

# Render the figure
plt.show()
Colors
# Colour assigned to each continent value found in data['cont'].
# Named continent_colors rather than `dict` so the builtin dict type
# is not shadowed for the rest of the session.
continent_colors = {
    'Asia': 'red',
    'Europe': 'green',
    'Africa': 'blue',
    'Americas': 'yellow',
    'Oceania': 'black',
}
# One colour per row, in row order; raises KeyError if a continent
# is missing from the mapping.
col = [continent_colors[continent] for continent in data['cont']]
# Bubble chart: per-point colours from col (c) and slight transparency (alpha)
plt.scatter(
    x=data['gdp_cap'],
    y=data['life_exp'],
    s=np.array(pop) * 2,
    c=col,
    alpha=0.8,
)

# Customizations carried over from the previous plots
plt.xscale('log')
plt.xlabel('GDP per Capita [in USD]')
plt.ylabel('Life Expectancy [in years]')
plt.title('World Development in 2007')
plt.xticks([1000, 10000, 100000], ['1k', '10k', '100k'])

# Render the figure
plt.show()
# Bubble chart with per-point colours and transparency
plt.scatter(
    x=data['gdp_cap'],
    y=data['life_exp'],
    s=np.array(pop) * 2,
    c=col,
    alpha=0.8,
)

# Customizations carried over from the previous plots
plt.xscale('log')
plt.xlabel('GDP per Capita [in USD]')
plt.ylabel('Life Expectancy [in years]')
plt.title('World Development in 2007')
plt.xticks([1000, 10000, 100000], ['1k', '10k', '100k'])

# Country annotations as (x, y, text) in data coordinates
for x_pos, y_pos, country in [(1550, 71, 'India'), (5700, 80, 'China')]:
    plt.text(x_pos, y_pos, country)

# Turn on grid lines
plt.grid(True)

# Render the figure
plt.show()
# Look up the rows of the countries that are labelled on the final plot
data[data['country']=='Ghana']
data[data['country']=='China']
data[data['country']=='India']
# NOTE(review): [134] on this Series looks like a label lookup on the default integer index,
# so sort_values() presumably does not change which country is returned — confirm intent (.iloc[134]?)
data['country'].sort_values()[134]
data[data['country']=='United States']
# Create a larger (10 x 6 inch) figure for the final chart
fig, ax = plt.subplots(figsize=(10, 6))

# Bubble chart with per-point colours and transparency
plt.scatter(
    x=data['gdp_cap'],
    y=data['life_exp'],
    s=np.array(pop) * 2,
    c=col,
    alpha=0.8,
)

# Customizations carried over from the previous plots
plt.xscale('log')
plt.xlabel('GDP per Capita [in USD]')
plt.ylabel('Life Expectancy [in years]')
plt.title('World Development in 2007')
plt.xticks([1000, 10000, 100000], ['1k', '10k', '100k'])

# Country annotations as (x, y, text) in data coordinates
country_labels = [
    (2452, 64, 'India'),
    (4959, 72, 'China'),
    (1300, 60, 'Ghana'),
    (42951, 78, 'United States'),
]
for x_pos, y_pos, country in country_labels:
    plt.text(x_pos, y_pos, country)

# Turn on grid lines
plt.grid(True)

# Render the figure
plt.show()