# Carefully modify the below two string variables. Ensure there are no typos.

student_id = "12345678" # set this to your student ID

student_mail = "firstname.lastname@student.manchester.ac.uk" # your email address

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

import numpy as np
import pandas as pd
import seaborn as sns

df = pd.read_csv("_datasets/UK_houseprices2026.csv")

sales = None
avgPrice = None

# TODO: Provide your solution code here that defines `sales` and `avgPrice`

# Solution
sales = df[["Date", "Region", "Mean Price", "Sales Volume", "New Price", "New Sales Volume", "Old Price", "Old Sales Volume"]]
avgPrice = sales["Mean Price"].dropna()

mismatchCount = None
mismatchRegions = None

# TODO: Provide your solution code here that defines `mismatchCount` and `mismatchRegions`

# Solution
mismatch = (df["Sales Volume"] - df["New Sales Volume"] - df["Old Sales Volume"]).abs() >= 0.5
mismatchCount = mismatch.sum()
mismatchRegions = df[mismatch]['Region'].unique() 

print(mismatchCount, mismatchRegions)

75 <StringArray>
[          'Inner London', 'North Northamptonshire',       'Northern Ireland',
         'United Kingdom',  'West Northamptonshire']
Length: 5, dtype: str

sales_clean = None

# TODO: Provide your solution code here

# Solution
sales_clean = df.dropna()
ind = sales_clean['Region'].isin({'United Kingdom', 'England', 'Scotland', 'Wales', 'Northern Ireland'})
sales_clean = sales_clean[~ind]

sales_clean

soldPerRegion = None

# TODO: Provide your solution code here

# Solution
soldPerRegion = df[['Region','Sales Volume']].groupby('Region').sum()['Sales Volume']
soldPerRegion

Region
Aberdeenshire                52333.0
Adur                         11978.0
Amber Valley                 26016.0
Angus                        24155.0
Antrim and Newtownabbey      25863.0
                              ...   
Wychavon                     25462.0
Wyre                         24007.0
Wyre Forest                  17783.0
York                         37919.0
Yorkshire and The Humber    954733.0
Name: Sales Volume, Length: 405, dtype: float64

annualAvg = None

# TODO: Provide your solution code here

# Solution
df_Date = df.copy()

df_Date = df_Date.loc[df_Date['Region']=='Greater Manchester'][['Date','Mean Price']]
df_Date['Date'] = pd.to_datetime(df_Date['Date'])
df_Date = df_Date.set_index(df_Date['Date'])
annualAvg = df_Date.groupby(df_Date['Date'].dt.year).mean()['Mean Price']

annualAvg

Date
2014    122045.666667
2015    128763.333333
2016    137953.250000
2017    147450.500000
2018    154871.083333
2019    160932.583333
2020    167812.250000
2021    191004.083333
2022    212646.416667
2023    216807.083333
2024    222817.083333
2025    232475.916667
Name: Mean Price, dtype: float64

detached2024 = None
detached2024_expensive = None

# TODO: Provide your solution code here

# Solution
ind = pd.to_datetime(df['Date']).dt.year==2024
detached2024 = df[ind]
detached2024 = detached2024.groupby('Region')['Detached Price'].median()
ind = detached2024 > detached2024.quantile(0.95)
detached2024_expensive = detached2024[ind]

detached2024_expensive

Region
Barnet                    1627087.5
Brent                     1325065.5
Camden                    3462403.0
City of Westminster       4368453.5
Ealing                    1350946.5
Elmbridge                 1504700.0
Enfield                   1204151.0
Hackney                   1324363.0
Hammersmith and Fulham    1829783.0
Haringey                  2220648.0
Harrow                    1180033.0
Inner London              1857506.5
Islington                 1670521.0
Kensington and Chelsea    4760293.0
Kingston upon Thames      1235464.5
Merton                    2198556.5
Richmond upon Thames      1703265.5
Southwark                 1710689.5
St Albans                 1181955.0
Wandsworth                2496095.5
Name: Detached Price, dtype: float64

p_gm = None

# TODO: Provide your solution code here

# Solution
gm_value = detached2024['Greater Manchester']
p_gm = (detached2024.dropna() < gm_value).mean() 

p_gm

np.float64(0.4122137404580153)

df_hpi = None

# TODO: Provide your solution code here

# Solution
df_hpi = df.copy()

# Filter the dataframe to get the prices on 1st January 2015
base_prices = df_hpi[df_hpi['Date'] == '2015-01-01'][['Region', 'Mean Price']].set_index('Region')

# Calculate the HPI for each row
df_hpi = df_hpi.set_index('Region')
df_hpi['HPI'] = df_hpi.apply(lambda row: (row['Mean Price'] / base_prices.loc[row.name,'Mean Price'])*100, axis=1).values

# Reset the index to bring RegionName back as a column
df_hpi = df_hpi.reset_index()

cheapest_month = None

# TODO: Provide your solution code here

# Solution

df2 = df[['Date','Region','Flat Price']]
df2 = df2[~df2['Flat Price'].isna()]

# Extract year and month from the Date column
df2['Year'] = pd.to_datetime(df2['Date']).dt.year
df2['Month'] = pd.to_datetime(df2['Date']).dt.month

# Find the month with the cheapest FlatPrice for each year and region
cheapest_months = df2.loc[df2.groupby(['Year', 'Region'])['Flat Price'].idxmin().dropna()]

# Find the most frequent month overall
cheapest_month = cheapest_months['Month'].mode()[0]

cheapest_month

np.int32(1)

try: 
    import re
    assert re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', student_mail) and not 'firstname' in student_mail
    print("OKAY - student_mail appears to be valid")
except:
    print("WARN - student_mail could not be verified")

try: 
    df_reload = pd.read_csv("_datasets/UK_houseprices2026.csv")
    assert df_reload.compare(df).empty
    print("OKAY - dataframe df has not been modified")
except:
    print("WARN - dataframe df has been modified")

try: 
    assert type(sales) == pd.DataFrame
    print("OKAY - sales should be a pandas dataframe")
except:
    print("FAIL - sales should be a pandas dataframe")

try: 
    assert type(avgPrice) == pd.Series
    print("OKAY - avgPrice should be a pandas series")
except:
    print("FAIL - avgPrice should be a pandas series")

try: 
    assert isinstance(mismatchCount, (int, np.integer))
    print("OKAY - mismatchCount should be an int")
except:
    print("FAIL - mismatchCount should be an int")

try: 
    assert isinstance(mismatchRegions, (np.ndarray, list, pd.arrays.StringArray))
    print("OKAY - mismatchRegions should be an array")
except:
    print("FAIL - mismatchRegions should be an array")

try:
    student_mail; student_id; df; sales; avgPrice; mismatchCount; mismatchRegions; sales_clean; 
    soldPerRegion; annualAvg; detached2024; detached2024_expensive; p_gm; df_hpi; cheapest_month
    print("OKAY - required variables exist in memory")
except:
    print("WARN - at least one of the required variables is not defined. check spelling?")

WARN - student_mail could not be verified
OKAY - dataframe df has not been modified
OKAY - sales should be a pandas dataframe
OKAY - avgPrice should be a pandas series
OKAY - mismatchCount should be an int
OKAY - mismatchRegions should be an array
OKAY - required variables exist in memory

	Date	Region	Mean Price	Sales Volume	New Price	New Sales Volume	Old Price	Old Sales Volume	Detached Price	Semi-Detached Price	Terraced Price	Flat Price	Cash Price	Cash Sales Volume	Mortgage Price	Mortgage Sales Volume	FTB Price	FOO Price
0	2014-01-01	Aberdeenshire	192239	411.0	260142.0	62.0	184548.0	349.0	276808.0	168052.0	141761.0	109450.0	179333.0	111.0	199310.0	300.0	150307.0	226869.0
1	2014-02-01	Aberdeenshire	191558	347.0	259718.0	84.0	183857.0	263.0	275641.0	167303.0	141339.0	109561.0	178732.0	79.0	198590.0	268.0	150054.0	225868.0
2	2014-03-01	Aberdeenshire	187155	383.0	253922.0	86.0	179592.0	297.0	270184.0	163266.0	137690.0	106211.0	174452.0	111.0	194095.0	272.0	146308.0	220887.0
3	2014-04-01	Aberdeenshire	189000	513.0	256803.0	100.0	181342.0	413.0	271913.0	165402.0	139679.0	107314.0	176207.0	121.0	195993.0	392.0	148189.0	222767.0
4	2014-05-01	Aberdeenshire	192885	585.0	263044.0	103.0	185020.0	482.0	277360.0	169069.0	142767.0	109078.0	179742.0	140.0	200057.0	445.0	150990.0	227513.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
58313	2025-06-01	Yorkshire and The Humber	201490	5015.0	292181.0	261.0	198291.0	4754.0	337742.0	207454.0	162373.0	125473.0	198786.0	1130.0	202679.0	2987.0	173934.0	240595.0
58314	2025-07-01	Yorkshire and The Humber	204361	5473.0	305374.0	34.0	200625.0	5439.0	342020.0	210223.0	164958.0	127510.0	201066.0	1193.0	205812.0	3183.0	176508.0	243854.0
58315	2025-08-01	Yorkshire and The Humber	206107	5471.0	302627.0	19.0	202629.0	5452.0	343290.0	212745.0	166495.0	128073.0	202747.0	1116.0	207587.0	2913.0	178185.0	245644.0
58316	2025-09-01	Yorkshire and The Humber	206581	4745.0	306535.0	18.0	202921.0	4727.0	346628.0	212622.0	166663.0	127992.0	203692.0	748.0	207852.0	1663.0	178353.0	246632.0
58317	2025-10-01	Yorkshire and The Humber	206401	4961.0	313072.0	8.0	202381.0	4953.0	344502.0	213475.0	166443.0	127278.0	203114.0	440.0	207848.0	1061.0	178864.0	245256.0

Coursework 1¶

Note on independent work¶

Start of test¶

Problem 1¶

Problem 2¶

Problem 3¶

Problem 4¶

Problem 5¶

Problem 6¶

Problem 7¶

Problem 8¶

Problem 9¶

End of test¶