Skip to content

023 Plotting Graphs : Answers to exercises

Exercise 1

We have seen how to access the dataset labels using:

headings = df.columns[1:-1]
  • Copy the code to read the HadSEEP monthly datasets above
  • Write and run code that plots the precipitation data for all months in separate subplots.
# ANSWER 
# Copy the code to read the HadSEEP monthly datasets above
import pandas as pd
from urlpath import URL
from pathlib import Path

# Monthly Southeast England precipitation (mm) 
site = 'https://www.metoffice.gov.uk/'
site_dir = 'hadobs/hadukp/data/monthly'
site_file = 'HadSEEP_monthly_totals.txt'

url = URL(site,site_dir,site_file)

# options passed to pd.read_table. These are defined *before* the
# first read so that the cell also runs in a fresh kernel.
#   skiprows  : skip the 3 lines that precede the column headings
#   na_values : treat -99.9 as NaN
#   sep       : columns are separated by one or more spaces
#   engine    : a regular-expression separator needs the python parser
panda_format = {
    'skiprows'   :  3,
    'na_values'  :  [-99.9],
    'sep'        :  r"[ ]{1,}",
    'engine'     :  'python'
}

# setup Path object for output file. This is done before the request,
# so that `filename` is defined whether or not the download succeeds
filename = Path('work',url.name)

r = url.get()
# ok records whether the download succeeded (status code 200)
ok = r.status_code == 200
if ok:
    # make sure the output directory exists before writing into it
    filename.parent.mkdir(parents=True,exist_ok=True)
    # write text data
    filename.write_text(r.text)
    # check size and report
    print(f'file {filename} written: {filename.stat().st_size} bytes')
else:
    print(f'failed to get {url}')

# read the saved text file into a pandas dataframe
# (if the download failed, this reads any copy already in `work`)
df=pd.read_table(filename,**panda_format)

# df.head: first n lines
df.head()
file work/HadSEEP_monthly_totals.txt written: 15209 bytes
Year Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec Annual
0 1873 87.1 50.4 52.9 19.9 41.1 63.6 53.2 56.4 62.0 86.0 59.4 15.7 647.7
1 1874 46.8 44.9 15.8 48.4 24.1 49.9 28.3 43.6 79.4 96.1 63.9 52.3 593.5
2 1875 96.9 39.7 22.9 37.0 39.1 76.1 125.1 40.8 54.7 137.7 106.4 27.1 803.5
3 1876 31.8 71.9 79.5 63.6 16.5 37.2 22.3 66.3 118.2 34.1 89.0 162.9 793.3
4 1877 146.0 47.7 56.2 66.4 62.3 24.9 78.5 82.4 38.4 58.1 144.5 54.2 859.6
# ANSWER 2
# Write and run code that plots the 
# precipitation data for all months in separate subplots.
import matplotlib.pyplot as plt

# plot size > in y
# need to play with this to get it right
x_size,y_size = 20,30

# get the month names from columns:
# [1:-1] leaves out the first ("Year") and last ("Annual") columns
months = df.columns[1:-1]

# x-axis limits used for every panel: first and last year in the data.
# These must be defined here, otherwise set_xlim below raises NameError.
year0,year1 = df["Year"].min(),df["Year"].max()

# one row of subplots per month
fig, axs = plt.subplots(len(months),1,figsize=(x_size,y_size))

# use enumerate in the loop, to get the index
for i,m in enumerate(months):
    # plot y-data for this month in black, in its own panel
    axs[i].plot(df["Year"],df[m],'k',label=m)
    axs[i].set_ylabel(f'{m} Precipitation (mm)')
    axs[i].set_xlim(year0,year1)

# x-label on the bottom panel only
_=axs[-1].set_xlabel('year')

png

Exercise 2

  • Read the 2276931.csv dataset into a pandas dataframe called df
  • Convert the field df["DATE"] to a list called dates
  • Use your understanding of datetime to convert the data dates[0] to a datetime object called start_date
  • Convert the data dates[-1] to a datetime object called end_date
  • Find how many days between start_date and end_date
  • Use a loop structure to convert all the elements in dates to be the number of days after the start date
# ANSWER
# Read the `2276931.csv` dataset into a 
# pandas dataframe called `df`
import pandas as pd
from urlpath import URL
from pathlib import Path

site = 'https://raw.githubusercontent.com'
site_dir = '/UCL-EO/geog0111/master/notebooks/data'
site_file = '2276931.csv'

# form the URL
url = URL(site,site_dir,site_file)

# setup Path object for output file. This is done before the request,
# so that `filename` is defined whether or not the download succeeds
filename = Path('work',url.name)

r = url.get()
# ok records whether the download succeeded (status code 200)
ok = r.status_code == 200
if ok:
    # make sure the output directory exists before writing into it
    filename.parent.mkdir(parents=True,exist_ok=True)
    # write text data
    filename.write_text(r.text)
    # check size and report
    print(f'file {filename} written: {filename.stat().st_size} bytes')
else:
    print(f'failed to get {url}')

# Read the saved CSV file into pandas.
# (if the download failed, this reads any copy already in `work`)
df=pd.read_csv(filename)

# print the first 5 lines of data
df.head(5)
file work/2276931.csv written: 15078 bytes
STATION NAME DATE PRCP SNOW
0 US1FLGD0002 HAVANA 4.2 SW, FL US 2020-01-01 0.00 0.0
1 US1FLGD0002 HAVANA 4.2 SW, FL US 2020-01-02 0.00 0.0
2 US1FLGD0002 HAVANA 4.2 SW, FL US 2020-01-03 0.00 0.0
3 US1FLGD0002 HAVANA 4.2 SW, FL US 2020-01-04 0.98 NaN
4 US1FLGD0002 HAVANA 4.2 SW, FL US 2020-01-05 0.00 0.0
from datetime import datetime
# ANSWER
# Pull the field `df["DATE"]` out of the dataframe
# as a plain Python list called `dates`
dates = df["DATE"].tolist()

# All of the date strings look like 2020-09-02, so one
# strptime format string serves for every conversion below
date_format = "%Y-%m-%d"

# `start_date`: the first entry, `dates[0]`, as a `datetime` object
start_date = datetime.strptime(dates[0], date_format)
print(f'{dates[0]} -> {start_date}')

# `end_date`: the last entry, `dates[-1]`, as a `datetime` object
end_date = datetime.strptime(dates[-1], date_format)
print(f'{dates[-1]} -> {end_date}')

# Subtracting two datetimes gives a timedelta;
# its .days attribute is the whole number of days between them
ndays = (end_date - start_date).days
print(f'ndays: {start_date} to {end_date}: {ndays}')

# Loop over every entry in `dates`, replacing it by the
# number of days it lies after the start date.
# Note that `ndays` is re-used here: it is now a list.
ndays = []
for d in dates:
    elapsed = datetime.strptime(d, date_format) - start_date
    ndays.append(elapsed.days)
print(ndays)
2020-01-01 -> 2020-01-01 00:00:00
2020-09-02 -> 2020-09-02 00:00:00
ndays: 2020-01-01 00:00:00 to 2020-09-02 00:00:00: 245
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245]

Exercise 3

We examined a pulsar time series in a previous section of notes. It contains the successive pulses of the oscillation signal coming from the Pulsar PSR B1919+21 discovered by Jocelyn Bell in 1967.

The dataset as presented contains samples in columns, so that sample 0 is df[0], up to df[79] (80 samples).

  • Plot the pulsar samples in a series of 80 sub-plots.

Advice:

For the figure, do not label the axes as it will get too cluttered. In any professional figure of that sort, you would need to explain the axes in accompanying text.

For further 'effects' consider switching off the plotting of axes in each subplot, with:

ax.axis('off')

for axis ax (this may be something like axs[i] in your code).

The results should be reminiscent of:

pulsar image

and

Joy Division

If you want to go further towards re-creating this, you could consult the matplotlib gallery for ideas.

# ANSWER 1

import pandas as pd
from urlpath import URL
from pathlib import Path

site = 'https://raw.githubusercontent.com'
site_dir = 'igorol/unknown_pleasures_plot/master'
site_file = 'pulsar.csv'

url = URL(site,site_dir,site_file)

# setup Path object for output file. This is done before the request,
# so that `filename` is defined whether or not the download succeeds
filename = Path('work',url.name)

r = url.get()
# ok records whether the download succeeded (status code 200)
ok = r.status_code == 200
if ok:
    # make sure the output directory exists before writing into it
    filename.parent.mkdir(parents=True,exist_ok=True)
    # write text data
    filename.write_text(r.text)
    # check size and report
    print(f'file {filename} written: {filename.stat().st_size} bytes')
else:
    print(f'failed to get {url}')

# transposed version:
# the file has no header line (header=None); transposing
# puts the samples in columns, so that sample 0 is df[0]
df=pd.read_csv(filename,header=None).transpose()
df
file work/pulsar.csv written: 130465 bytes
0 1 2 3 4 5 6 7 8 9 ... 70 71 72 73 74 75 76 77 78 79
0 -0.81 -0.61 -1.43 -1.09 -1.13 -0.66 -0.36 -0.73 -0.89 -0.69 ... 0.00 -0.16 0.19 -0.32 -0.16 0.62 0.32 -0.09 0.11 0.12
1 -0.91 -0.40 -1.15 -0.85 -0.98 -0.89 -0.21 -0.83 -0.61 -0.54 ... -0.12 -0.15 0.06 -0.83 -0.26 0.64 0.31 -0.14 0.05 -0.12
2 -1.09 -0.42 -1.25 -0.72 -0.93 -0.87 -0.44 -0.91 -0.74 -0.84 ... 0.10 0.25 -0.27 -0.69 -0.36 0.59 0.28 -0.24 0.05 -0.12
3 -1.00 -0.38 -1.13 -0.74 -0.90 -0.87 -0.20 -1.10 -0.85 -0.89 ... -0.01 0.37 -0.11 -0.80 -0.49 0.30 0.42 -0.24 -0.05 -0.45
4 -0.59 -0.55 -0.76 -0.26 -1.14 -1.07 -0.31 -0.87 -0.77 -0.45 ... -0.15 -0.13 0.09 -0.76 0.00 0.01 -0.24 -0.66 -0.03 -0.24
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
295 -0.26 -0.83 0.11 -1.03 -0.29 -0.55 -1.45 -1.20 -0.94 -0.16 ... 0.47 0.10 -0.06 0.08 0.28 -0.21 -0.56 -0.12 -0.87 0.13
296 -0.52 -0.80 -0.77 -0.78 -0.54 -0.62 -0.77 -1.40 -1.05 0.24 ... 0.41 0.02 -0.08 -0.15 -0.01 -0.09 -0.50 0.29 -1.31 0.09
297 -0.44 -0.47 -0.88 -0.40 -0.65 -0.71 0.03 -0.51 -0.51 -0.17 ... 0.32 -0.10 -0.04 0.03 -0.67 -0.24 -0.38 -0.02 -1.02 -0.01
298 -0.58 -0.13 -0.45 0.18 -0.64 -0.88 0.47 0.25 -0.47 -0.09 ... 0.57 -0.16 0.23 0.03 -0.86 -0.17 -0.58 0.21 -1.10 -0.03
299 -0.54 -0.12 -1.01 0.27 -0.94 -0.70 1.33 0.74 -0.79 0.01 ... 0.48 -0.06 -0.10 -0.54 -1.66 -0.62 -0.43 0.44 -1.62 -0.23

300 rows × 80 columns

# ANSWER 2
# Plot the pulsar samples in a series of 80 sub-plots.
import matplotlib.pyplot as plt

# figure size: adjust by trial and error until it looks right
x_size,y_size = 15,20

# the column labels of df identify the samples
samples = df.columns

# a single column of panels, one row per sample
fig,axs = plt.subplots(len(df.columns),1,figsize=(x_size,y_size))

# pair each panel with the sample to be drawn in it
for ax,sample in zip(axs,samples):
    # black line for the sample ...
    ax.plot(df[sample],'k')
    # ... with the axes switched off to avoid clutter
    ax.axis('off')

png


Last update: October 1, 2021