Blog Post 1
In this post, we’re going to create interesting and interactive data visualizations using the sqllite
, pandas
, plotly
, and sklearn
libraries.
Background
The data set we’ll be using, the NOAA-GHCN Temperature Dataset, consists of monthly surface temperature recordings taken at weather stations around the world. Two other data sets we’ll be using contain more information about the countries these stations are located in as well as the stations themselves.
Create a Database
To begin, we’re going to create a database to store our three tables: temperatures
, stations
, and countries
. Download the .csv files used below:
import sqlite3
import pandas as pd
# create a database
conn = sqlite3.connect("temps.db")
# temperatures table
temperatures = pd.read_csv("temps.csv")
# stations table
stations = pd.read_csv("station-metadata.csv")
# countries table
countries = pd.read_csv("countries.csv")
temperatures.to_sql("temperatures", conn, if_exists = "replace", index = False)
stations.to_sql("stations", conn, if_exists = "replace", index = False)
countries.to_sql("countries", conn, if_exists = "replace", index = False)
# closing the connection
conn.close()
Write a Query Function
We write a function, query_climate_database()
, that takes in country
, year_begin
, year_end
, and month
to output temperature readings in country
, from the years year_begin
to year_end
, only for months month
as a Pandas dataframe.
def query_climate_database(country, year_begin, year_end, month):
# connect to database
conn = sqlite3.connect("temps.db")
# select joined temperatures and stations tables with year between year_begin and year_end
cmd = "SELECT T.ID, T.Year, T.VALUE1, T.VALUE2, T.VALUE3, T.VALUE4, T.VALUE5, T.VALUE6, T.VALUE7, T.VALUE8, T.VALUE9, T.VALUE10, T.VALUE11, T.VALUE12, S.LATITUDE, S.LONGITUDE, S.NAME FROM temperatures T LEFT JOIN stations S on T.ID = S.ID WHERE T.Year >= " + str(year_begin) + " AND T.Year <= " + str(year_end)
df = pd.read_sql_query(cmd, conn)
# select countries table with country
countries_cmd = "SELECT * FROM countries WHERE Name = \"" + country + "\""
countries = pd.read_sql_query(countries_cmd, conn)
conn.close()
# countries' FIPS 10-4 is the first two substrings of df's ID
df["FIPS 10-4"] = df["ID"].str[0:2]
# join countries and df on FIPS 10-4
df = pd.merge(df, countries, on = ["FIPS 10-4"])
df = df.drop(["ID", "FIPS 10-4", "ISO 3166"], axis = 1)
# turn columns that won't be stacked into multi-indices
df = df.set_index(keys=["Year", "LATITUDE", "LONGITUDE", "NAME", "Name")
# stack monthly temperature readings
df = df.stack()
# reset multi-indices
df = df.reset_index()
df = df.rename(columns = {"Name" : "Country", "level_5" : "Month" , 0 : "Temp"})
# removing "VALUE" from Month column
df["Month"] = df["Month"].str[5:].astype(int)
# subset of dataframe where Month is month
df = df[df["Month"] == month]
df["Temp"] = df["Temp"] / 100
df = df[["NAME", "LATITUDE", "LONGITUDE", "Country", "Year", "Month", "Temp"]]
df = df.set_index(pd.Index(range(len(df))))
return(df)
Once the function is written, we can pass in sample inputs to ensure it runs correctly.
query_climate_database(country = "India",
year_begin = 1980,
year_end = 2020,
month = 1)
NAME | LATITUDE | LONGITUDE | Country | Year | Month | Temp | |
---|---|---|---|---|---|---|---|
0 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1980 | 1 | 23.48 |
1 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1981 | 1 | 24.57 |
2 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1982 | 1 | 24.19 |
3 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1983 | 1 | 23.51 |
4 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1984 | 1 | 24.81 |
... | ... | ... | ... | ... | ... | ... | ... |
3147 | DARJEELING | 27.050 | 88.270 | India | 1983 | 1 | 5.10 |
3148 | DARJEELING | 27.050 | 88.270 | India | 1986 | 1 | 6.90 |
3149 | DARJEELING | 27.050 | 88.270 | India | 1994 | 1 | 8.10 |
3150 | DARJEELING | 27.050 | 88.270 | India | 1995 | 1 | 5.60 |
3151 | DARJEELING | 27.050 | 88.270 | India | 1997 | 1 | 5.70 |
Write a Geographic Scatter Function for Yearly Temperature Increases
Next, we write the temperature_coefficient_plot()
function, which takes the same arguments as query_climate_database()
, in addition to min_obs
, the minimum number of years temperature recordings were taken for the station to be displayed on our geographic scatterplot, as well as **kwargs
, additional arguments to be passed to scatter_mapbox()
. The function outputs a geographic scatterplot displaying stations in country
, with the color of the station reflecting the average year-over-year change in temperature from year_begin
to year_end
, in month
.
# import plotly express and a library for linear regression
from plotly import express as px
from sklearn.linear_model import LinearRegression
# additional function that outputs the slope of the linear regeression model when regressing temperature on year
def coef(data_group):
x = data_group[["Year"]]
y = data_group["Temp"]
LR = LinearRegression()
LR.fit(x, y)
return LR.coef_[0]
def temperature_coefficient_plot(country, year_begin, year_end, month, min_obs, **kwargs):
# query climate database for input variables to output dataframe
df = query_climate_database(country, year_begin, year_end, month)
# find number of observations in dataframe of each station
df["obs"] = df.groupby(["NAME"])["Year"].transform(len)
# filter for stations that have more observations than min_obs
df = df[df["obs"] >= min_obs]
# split-apply-combine to find the average year-over-year increase in temperature for each station, using the slope of the linear regression model
coefs = df.groupby(["NAME", "LATITUDE", "LONGITUDE"]).apply(coef)
# reset multi-indices
coefs = coefs.reset_index()
coefs = coefs.rename(columns = {0 : "Estimated Yearly Increase (°C)"})
coefs["Estimated Yearly Increase (°C)"] = round(coefs["Estimated Yearly Increase (°C)"], 4)
month_dict = {1 : "January", 2 : "February", 3 : "March", 4 : "April", 5 : "May", 6 : "June", 7 : "July", 8 : "August", 9 : "September", 10 : "October", 11 : "November", 12 : "December"}
# create geographic scatterplot, with intensity of color reflecting the station's average yearly temperature increase relative to other stations
fig = px.scatter_mapbox(coefs,
lat = "LATITUDE",
lon = "LONGITUDE",
hover_name = "NAME",
color = "Estimated Yearly Increase (°C)",
title = "Estimates of yearly increase in temperature in " + month_dict[month] + " for stations in " + country + ", years " + str(year_begin) + "-" + str(year_end),
color_continuous_midpoint = 0,
**kwargs)
fig.update_layout(margin={"r" : 0, "t" : 50, "l" : 0, "b" : 0}, font = dict(size = 10))
return(fig)
Again, to ensure the function is written correctly, we use sample inputs:
# assumes you have imported necessary packages
color_map = px.colors.diverging.RdGy_r # choose a colormap
fig = temperature_coefficient_plot("India", 1980, 2020, 1,
min_obs = 10,
zoom = 2,
mapbox_style="carto-positron",
color_continuous_scale=color_map)
fig.show()
Create Two More Interesting Figures
We’d like to answer two additional questions:
- Which countries have seen the highest increase in average yearly temperature?
- How has temperature increase changed between countries in the northern hemisphere vs. the southern hemisphere?
To answer the first question, let’s use a choropleth. We use the GeoJSON file in the PIC16B directory.
from urllib.request import urlopen
import json
def query_climate_database(year_begin, year_end):
conn = sqlite3.connect("temps.db")
cmd = "SELECT * FROM temperatures WHERE Year >= " + str(year_begin) + " AND Year <= " + str(year_end)
df = pd.read_sql_query(cmd, conn)
countries_cmd = "SELECT * FROM countries"
countries = pd.read_sql_query(countries_cmd, conn)
conn.close()
df["FIPS 10-4"] = df["ID"].str[0:2]
df = pd.merge(df, countries, on = ["FIPS 10-4"])
df = df.drop(["ID", "FIPS 10-4", "ISO 3166"], axis = 1)
df = df.set_index(keys=["Name", "Year"])
df = df.stack()
df = df.reset_index()
df = df.drop(["level_2"], axis = 1)
df = df.rename(columns = {0 : "Temp"})
df["Temp"] = df["Temp"] / 100
return(df)
def coef(data_group):
x = data_group[["Year"]]
y = data_group["Temp"]
LR = LinearRegression()
LR.fit(x, y)
return LR.coef_[0]
def temperature_increase_choropleth_plot(year_begin, year_end, **kwargs):
# PIC16B directory with GeoJSON data
countries_gj_url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/countries.geojson"
with urlopen(countries_gj_url) as response:
countries_gj = json.load(response)
df = query_climate_database(year_begin, year_end)
df = df.groupby(["Name", "Year"]).mean()
df = df.reset_index()
coefs = df.groupby(["Name"]).apply(coef)
coefs = coefs.reset_index()
coefs = coefs.rename(columns = {0 : "Temperature Increase (°C)",})
# using a choropleth
fig = px.choropleth(coefs,
geojson = countries_gj,
locations = "Name",
locationmode = "country names",
color = "Temperature Increase (°C)",
height = 300,
title = "Estimates of yearly increase in temperature across countries, years " + str(year_begin) + "-" + str(year_end),
range_color = [-0.1, 0.1],
**kwargs)
fig.update_layout(margin={"r" : 0, "t" : 50, "l" : 0, "b" : 0})
return(fig)
temperature_increase_plot(1980, 2020)
We see that temperatures have generally increased in Europe, and not as much in North and South America and Asia. Africa sees a wide variety of temperature changes within its countries, and Antarctica sees the highest decrease.
For our second question, let’s use a line plot with a facet.
def query_climate_database(year_begin, year_end):
conn = sqlite3.connect("temps.db")
cmd = "SELECT T.ID, T.Year, T.VALUE1, T.VALUE2, T.VALUE3, T.VALUE4, T.VALUE5, T.VALUE6, T.VALUE7, T.VALUE8, T.VALUE9, T.VALUE10, T.VALUE11, T.VALUE12, S.LATITUDE, S.LONGITUDE, S.NAME FROM temperatures T LEFT JOIN stations S on T.ID = S.ID WHERE T.Year >= " + str(year_begin) + " AND T.Year <= " + str(year_end)
df = pd.read_sql_query(cmd, conn)
countries_cmd = "SELECT * FROM countries"
countries = pd.read_sql_query(countries_cmd, conn)
conn.close()
df["FIPS 10-4"] = df["ID"].str[0:2]
df = pd.merge(df, countries, on = ["FIPS 10-4"])
df = df.drop(["ID", "FIPS 10-4", "ISO 3166"], axis = 1)
df = df.set_index(keys=["Year", "LATITUDE", "LONGITUDE", "NAME", "Name"])
df = df.stack()
df = df.reset_index()
df = df.rename(columns = {"level_5" : "Month", 0 : "Temp"})
df["Month"] = df["Month"].str[5:].astype(int)
df["Temp"] = df["Temp"] / 100
# a station is in the northern hemisphere if its latitude is above 0 and is in the southern hemisphere otherwise
df["Hemisphere"]= df["LATITUDE"].apply(lambda x: "Northern" if (x > 0) else "Southern")
df = df.drop(["NAME", "Name", "LATITUDE", "LONGITUDE"], axis = 1)
return(df)
def temperature_increase_line_plot(year_begin, year_end, **kwargs):
df = query_climate_database(year_begin, year_end)
df = df.groupby(["Year", "Hemisphere", "Month"]).mean()
df = df.reset_index()
# using Hemisphere as a facet to track differences in temperature between the two hemispheres
fig = px.line(df,
x = "Year",
y = "Temp",
color = "Month",
facet_col = "Hemisphere",
title = "Change in Yearly Average Temperature in Northern and Southern Hemispheres Across Months, Years " + str(year_begin) + "-" + str(year_end),
height = 300,
**kwargs)
fig.update_layout(margin={"r" : 0, "t" : 50, "l" : 0, "b" : 0})
return(fig)
temperature_increase_line_plot(1980, 2020)
We see that temperatures are rising slowly in the Northern Hemisphere across all months, and are relatively stable in the Southern Hemisphere. As expected, the temperatures of the months are “flipped”, since the Southern Hemisphere has opposite seasons from the Northern Hemisphere.