Loading the Gapminder Data
There are two ways to load the Gapminder data: 1) using a user-defined function, or 2) using the older Gapminder dataset that is embedded in the Plotly library.
1. User-defined function
Here is a single function that loads the Gapminder data from the Systema Globalis dataset.
# !pip install plotly  # uncomment if using Google Colab
import numpy as np
import pandas as pd
import plotly.express as px
def load_gapminder_data():
    """Assemble key Gapminder indicators into a single pandas DataFrame.

    Downloads CSV files from the Open Numbers Systema Globalis repository
    (pinned to tag v1.20.1): population, income per person, life expectancy,
    child mortality and the country attribute table. The indicator tables are
    merged on ("geo", "time") and the result is joined to the country
    attributes on "geo"; every merge uses pandas' default join type.

    Returns:
        A DataFrame limited to the years 1950 through 2021 (inclusive), with
        the index reset, the "geo" column removed, the "country" column moved
        to the front, and the indicator columns renamed to "year",
        "population", "income_per_person", "life_expectancy" and
        "child_mortality".
    """
    # Pinned to a specific GitHub tag.
    repo_root = "https://raw.githubusercontent.com/open-numbers/ddf--gapminder--systema_globalis/v1.20.1/"

    # Indicator datapoints (merged below on geo and time).
    income = pd.read_csv(
        repo_root
        + "ddf--datapoints--income_per_person_gdppercapita_ppp_inflation_adjusted--by--geo--time.csv"
    )
    life_expectancy = pd.read_csv(
        repo_root
        + "ddf--datapoints--life_expectancy_at_birth_with_projections--by--geo--time.csv"
    )
    population = pd.read_csv(
        repo_root + "ddf--datapoints--population_total--by--geo--time.csv"
    )
    child_mortality = pd.read_csv(
        repo_root
        + "ddf--datapoints--child_mortality_0_5_year_olds_dying_per_1000_born--by--geo--time.csv"
    )

    # Country attributes: keep four columns and rename them so the key column
    # is called "geo" (matching the indicator tables) and the display name
    # is called "country".
    country_attrs = pd.read_csv(repo_root + "ddf--entities--geo--country.csv")
    wanted = ["country", "name", "world_4region", "world_6region"]
    country_attrs = country_attrs[wanted].rename(
        columns={"country": "geo", "name": "country"}
    )
    # Capitalise the 4-region labels.
    country_attrs = country_attrs.assign(
        world_4region=country_attrs["world_4region"].str.capitalize()
    )

    # Join the indicators one after another, starting from population,
    # then attach the country attributes.
    combined = population
    for indicator in (income, life_expectancy, child_mortality):
        combined = pd.merge(combined, indicator, on=["geo", "time"])
    combined = pd.merge(combined, country_attrs, on="geo")

    # Shorter, friendlier column names.
    concise_names = {
        "time": "year",
        "population_total": "population",
        "income_per_person_gdppercapita_ppp_inflation_adjusted": "income_per_person",
        "life_expectancy_at_birth_with_projections": "life_expectancy",
        "child_mortality_0_5_year_olds_dying_per_1000_born": "child_mortality",
    }
    combined = combined.rename(columns=concise_names)

    # Keep only 1950-2021; the original author notes this range is less jumpy.
    combined = combined.query("1950 <= year <= 2021").reset_index(drop=True)

    # Tidy up: drop the join key and put the country name first.
    combined = combined.drop(columns=["geo"])
    remaining = [col for col in combined.columns if col != "country"]
    return combined[["country"] + remaining]
# Build the combined Gapminder DataFrame by calling the loader defined above.
df_gapminder = load_gapminder_data()
# Bare expression: presumably the last line of a notebook cell, so the
# DataFrame is displayed as the cell output.
df_gapminder
2. Import data from Plotly
The Plotly library also has a gapminder dataset that ends in 2007.
import plotly.express as px
import pandas as pd
# Load the gapminder dataset that ships with Plotly Express.
df = px.data.gapminder()
Last updated