7 minute read

In this post, we’re going to create interesting and interactive data visualizations using the sqllite, pandas, plotly, and sklearn libraries.

Background

The data set we’ll be using, the NOAA-GHCN Temperature Dataset, consists of monthly surface temperature recordings taken at weather stations around the world. Two other data sets we’ll be using contain more information about the countries these stations are located in as well as the stations themselves.

Create a Database

To begin, we’re going to create a database to store our three tables: temperatures, stations, and countries. Download the .csv files used below:

import sqlite3
import pandas as pd

# create a database
conn = sqlite3.connect("temps.db")

# temperatures table
temperatures = pd.read_csv("temps.csv")

# stations table
stations = pd.read_csv("station-metadata.csv")

# countries table
countries = pd.read_csv("countries.csv")

temperatures.to_sql("temperatures", conn, if_exists = "replace", index = False)
stations.to_sql("stations", conn, if_exists = "replace", index = False)
countries.to_sql("countries", conn, if_exists = "replace", index = False)

# closing the connection
conn.close()

Write a Query Function

We write a function, query_climate_database(), that takes in country, year_begin, year_end, and month to output temperature readings in country, from the years year_begin to year_end, only for months month as a Pandas dataframe.

def query_climate_database(country, year_begin, year_end, month):
    # connect to database
    conn = sqlite3.connect("temps.db")

    # select joined temperatures and stations tables with year between year_begin and year_end
    cmd = "SELECT T.ID, T.Year, T.VALUE1, T.VALUE2, T.VALUE3, T.VALUE4, T.VALUE5, T.VALUE6, T.VALUE7, T.VALUE8, T.VALUE9, T.VALUE10, T.VALUE11, T.VALUE12, S.LATITUDE, S.LONGITUDE, S.NAME FROM temperatures T LEFT JOIN stations S on T.ID = S.ID WHERE T.Year >= " + str(year_begin) + " AND T.Year <= " + str(year_end) 
    df = pd.read_sql_query(cmd, conn)

    # select countries table with country
    countries_cmd = "SELECT * FROM countries WHERE Name = \"" + country + "\""
    countries = pd.read_sql_query(countries_cmd, conn)
    conn.close()

    # countries' FIPS 10-4 is the first two substrings of df's ID
    df["FIPS 10-4"] = df["ID"].str[0:2]

    # join countries and df on FIPS 10-4
    df = pd.merge(df, countries, on = ["FIPS 10-4"])
    df = df.drop(["ID", "FIPS 10-4", "ISO 3166"], axis = 1)

    # turn columns that won't be stacked into multi-indices
    df = df.set_index(keys=["Year", "LATITUDE", "LONGITUDE", "NAME", "Name")

    # stack monthly temperature readings
    df = df.stack()

    # reset multi-indices
    df = df.reset_index()
    df = df.rename(columns = {"Name" : "Country", "level_5"  : "Month" , 0 : "Temp"})

    # removing "VALUE" from Month column
    df["Month"] = df["Month"].str[5:].astype(int)

    # subset of dataframe where Month is month
    df = df[df["Month"] == month]
    df["Temp"] = df["Temp"] / 100
    df = df[["NAME", "LATITUDE", "LONGITUDE", "Country", "Year", "Month", "Temp"]]
    df = df.set_index(pd.Index(range(len(df))))
    return(df)

Once the function is written, we can pass in sample inputs to ensure it runs correctly.

query_climate_database(country = "India", 
                       year_begin = 1980, 
                       year_end = 2020,
                        month = 1)
NAME LATITUDE LONGITUDE Country Year Month Temp
0 PBO_ANANTAPUR 14.583 77.633 India 1980 1 23.48
1 PBO_ANANTAPUR 14.583 77.633 India 1981 1 24.57
2 PBO_ANANTAPUR 14.583 77.633 India 1982 1 24.19
3 PBO_ANANTAPUR 14.583 77.633 India 1983 1 23.51
4 PBO_ANANTAPUR 14.583 77.633 India 1984 1 24.81
... ... ... ... ... ... ... ...
3147 DARJEELING 27.050 88.270 India 1983 1 5.10
3148 DARJEELING 27.050 88.270 India 1986 1 6.90
3149 DARJEELING 27.050 88.270 India 1994 1 8.10
3150 DARJEELING 27.050 88.270 India 1995 1 5.60
3151 DARJEELING 27.050 88.270 India 1997 1 5.70

Write a Geographic Scatter Function for Yearly Temperature Increases

Next, we write the temperature_coefficient_plot() function, which takes the same arguments as query_climate_database(), in addition to min_obs, the minimum number of years temperature recordings were taken for the station to be displayed on our geographic scatterplot, as well as **kwargs, additional arguments to be passed to scatter_mapbox(). The function outputs a geographic scatterplot displaying stations in country, with the color of the station reflecting the average year-over-year change in temperature from year_begin to year_end, in month.

# import plotly express and a library for linear regression
from plotly import express as px 
from sklearn.linear_model import LinearRegression

# additional function that outputs the slope of the linear regeression model when regressing temperature on year 
def coef(data_group):
    x = data_group[["Year"]]
    y = data_group["Temp"]  
    LR = LinearRegression()
    LR.fit(x, y)
    return LR.coef_[0]

def temperature_coefficient_plot(country, year_begin, year_end, month, min_obs, **kwargs):
    # query climate database for input variables to output dataframe
    df = query_climate_database(country, year_begin, year_end, month)

    # find number of observations in dataframe of each station
    df["obs"] = df.groupby(["NAME"])["Year"].transform(len)

    # filter for stations that have more observations than min_obs
    df = df[df["obs"] >= min_obs]

    # split-apply-combine to find the average year-over-year increase in temperature for each station, using the slope of the linear regression model
    coefs = df.groupby(["NAME", "LATITUDE", "LONGITUDE"]).apply(coef)

    # reset multi-indices
    coefs = coefs.reset_index()

    coefs = coefs.rename(columns = {0 : "Estimated Yearly Increase (°C)"})
    coefs["Estimated Yearly Increase (°C)"] = round(coefs["Estimated Yearly Increase (°C)"], 4)
    month_dict = {1 : "January", 2 : "February", 3 : "March", 4 : "April", 5 : "May", 6 : "June", 7 : "July", 8 : "August", 9 : "September", 10 : "October", 11 : "November", 12 : "December"} 

    # create geographic scatterplot, with intensity of color reflecting the station's average yearly temperature increase relative to other stations
    fig = px.scatter_mapbox(coefs,
                        lat = "LATITUDE",
                        lon = "LONGITUDE",
                        hover_name = "NAME",
                        color = "Estimated Yearly Increase (°C)",
                        title = "Estimates of yearly increase in temperature in " + month_dict[month] + " for stations in " + country + ", years " + str(year_begin) + "-" + str(year_end),
                        color_continuous_midpoint = 0,
                        **kwargs)
    fig.update_layout(margin={"r" : 0, "t" : 50, "l" : 0, "b" : 0}, font = dict(size = 10))
    return(fig)

Again, to ensure the function is written correctly, we use sample inputs:

# assumes you have imported necessary packages
color_map = px.colors.diverging.RdGy_r # choose a colormap

fig = temperature_coefficient_plot("India", 1980, 2020, 1, 
                                   min_obs = 10,
                                   zoom = 2,
                                   mapbox_style="carto-positron",
                                   color_continuous_scale=color_map)

fig.show()

Create Two More Interesting Figures

We’d like to answer two additional questions:

  1. Which countries have seen the highest increase in average yearly temperature?
  2. How has temperature increase changed between countries in the northern hemisphere vs. the southern hemisphere?

To answer the first question, let’s use a choropleth. We use the GeoJSON file in the PIC16B directory.

from urllib.request import urlopen
import json

def query_climate_database(year_begin, year_end):
    conn = sqlite3.connect("temps.db")
    cmd = "SELECT * FROM temperatures WHERE Year >= " + str(year_begin) + " AND Year <= " + str(year_end) 
    df = pd.read_sql_query(cmd, conn)
    countries_cmd = "SELECT * FROM countries"
    countries = pd.read_sql_query(countries_cmd, conn)
    conn.close()
    df["FIPS 10-4"] = df["ID"].str[0:2]
    df = pd.merge(df, countries, on = ["FIPS 10-4"])
    df = df.drop(["ID", "FIPS 10-4", "ISO 3166"], axis = 1)
    df = df.set_index(keys=["Name", "Year"])   
    df = df.stack()
    df = df.reset_index()
    df = df.drop(["level_2"], axis = 1)
    df = df.rename(columns = {0 : "Temp"})
    df["Temp"] = df["Temp"] / 100
    return(df)

def coef(data_group):
    x = data_group[["Year"]]
    y = data_group["Temp"]  
    LR = LinearRegression()
    LR.fit(x, y)
    return LR.coef_[0]

def temperature_increase_choropleth_plot(year_begin, year_end, **kwargs):
    # PIC16B directory with GeoJSON data
    countries_gj_url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/countries.geojson"

    with urlopen(countries_gj_url) as response:
        countries_gj = json.load(response)

    df = query_climate_database(year_begin, year_end)
    df = df.groupby(["Name", "Year"]).mean()
    df = df.reset_index()
    coefs = df.groupby(["Name"]).apply(coef)
    coefs = coefs.reset_index()
    coefs = coefs.rename(columns = {0 : "Temperature Increase (°C)",})

    # using a choropleth
    fig = px.choropleth(coefs,
                     geojson = countries_gj,
                     locations = "Name",
                     locationmode = "country names",
                     color = "Temperature Increase (°C)",
                     height = 300,
                     title = "Estimates of yearly increase in temperature across countries, years " + str(year_begin) + "-" + str(year_end),
                     range_color = [-0.1, 0.1],
                     **kwargs)
    fig.update_layout(margin={"r" : 0, "t" : 50, "l" : 0, "b" : 0})
    return(fig)

temperature_increase_plot(1980, 2020)

We see that temperatures have generally increased in Europe, and not as much in North and South America and Asia. Africa sees a wide variety of temperature changes within its countries, and Antarctica sees the highest decrease.

For our second question, let’s use a line plot with a facet.

def query_climate_database(year_begin, year_end):
    conn = sqlite3.connect("temps.db")
    cmd = "SELECT T.ID, T.Year, T.VALUE1, T.VALUE2, T.VALUE3, T.VALUE4, T.VALUE5, T.VALUE6, T.VALUE7, T.VALUE8, T.VALUE9, T.VALUE10, T.VALUE11, T.VALUE12, S.LATITUDE, S.LONGITUDE, S.NAME FROM temperatures T LEFT JOIN stations S on T.ID = S.ID WHERE T.Year >= " + str(year_begin) + " AND T.Year <= " + str(year_end) 
    df = pd.read_sql_query(cmd, conn)
    countries_cmd = "SELECT * FROM countries"
    countries = pd.read_sql_query(countries_cmd, conn)
    conn.close()
    df["FIPS 10-4"] = df["ID"].str[0:2]
    df = pd.merge(df, countries, on = ["FIPS 10-4"])
    df = df.drop(["ID", "FIPS 10-4", "ISO 3166"], axis = 1)
    df = df.set_index(keys=["Year", "LATITUDE", "LONGITUDE", "NAME", "Name"])   
    df = df.stack()
    df = df.reset_index()
    df = df.rename(columns = {"level_5" : "Month", 0 : "Temp"})
    df["Month"] = df["Month"].str[5:].astype(int)
    df["Temp"] = df["Temp"] / 100

    # a station is in the northern hemisphere if its latitude is above 0 and is in the southern hemisphere otherwise
    df["Hemisphere"]= df["LATITUDE"].apply(lambda x: "Northern" if (x > 0) else "Southern") 

    df = df.drop(["NAME", "Name", "LATITUDE", "LONGITUDE"], axis = 1)
    return(df)

def temperature_increase_line_plot(year_begin, year_end, **kwargs):
    df = query_climate_database(year_begin, year_end)
    df = df.groupby(["Year", "Hemisphere", "Month"]).mean()
    df = df.reset_index()

    # using Hemisphere as a facet to track differences in temperature between the two hemispheres
    fig = px.line(df, 
                  x = "Year", 
                  y = "Temp", 
                  color = "Month", 
                  facet_col = "Hemisphere",
                  title = "Change in Yearly Average Temperature in Northern and Southern Hemispheres Across Months, Years " + str(year_begin) + "-" + str(year_end),
                  height = 300,
                  **kwargs)
    fig.update_layout(margin={"r" : 0, "t" : 50, "l" : 0, "b" : 0})
    return(fig)

temperature_increase_line_plot(1980, 2020)

We see that temperatures are rising slowly in the Northern Hemisphere across all months, and are relatively stable in the Southern Hemisphere. As expected, the temperatures of the months are “flipped”, since the Southern Hemisphere has opposite seasons from the Northern Hemisphere.

Updated: