Fix bugs in Stations
mikeqfu committed Jun 10, 2023
1 parent 83e311b commit 0b5062b
Showing 1 changed file with 101 additions and 62 deletions.
163 changes: 101 additions & 62 deletions pyrcs/other_assets/station.py
@@ -11,11 +11,12 @@
from pyhelpers.dirs import cd
from pyhelpers.ops import fake_requests_headers
from pyhelpers.store import load_data
from pyhelpers.text import remove_punctuation

from ..parser import get_catalogue, get_last_updated_date, parse_tr
from ..utils import cd_data, collect_in_fetch_verbose, format_err_msg, home_page_url, init_data_dir, \
is_home_connectable, print_conn_err, print_inst_conn_err, print_void_msg, save_data_to_file, \
validate_initial
from pyrcs.parser import get_catalogue, get_last_updated_date, parse_tr
from pyrcs.utils import cd_data, collect_in_fetch_verbose, format_err_msg, home_page_url, \
init_data_dir, is_home_connectable, print_conn_err, print_inst_conn_err, print_void_msg, \
save_data_to_file, validate_initial


class Stations:
@@ -188,6 +189,38 @@ def _cdd(self, *sub_dir, **kwargs):

return path

@staticmethod
def split_elr_mileage_column(dat):
if 'ELRMileage' in dat.columns:
temp = dat['ELRMileage'].str.split(r'\t\t / | / \[', n=1, regex=True, expand=True)
temp.columns = ['ELR', 'Mileage']

# elr_dat = temp['ELR'].str.strip().str.replace(' ', ' &&& ')
# elr_dat = temp['ELR'].str.strip().str.replace(' / ', ' &&& ')

def _to_repl(x, repl=' &&& '):
if ' / ' in x:
y = x.replace(' / ', repl)
elif ' / ' in x:
y = x.replace(' / ', repl)
elif ' ' in x:
y = x.replace(' ', repl)
else:
y = x
return y

elr_dat = temp['ELR'].str.strip().map(_to_repl)

temp['Mileage'] = temp['Mileage'].str.split('\t\t / ').fillna('').map(
lambda x: [' / '.join(x_.strip(' []').split(' ')) for x_ in x])
mil_dat = temp['Mileage'].map(lambda x: x[0] if len(x) == 1 else ' &&& '.join(x))

dat.drop(columns=['ELRMileage'], inplace=True)
dat.insert(1, 'ELR', elr_dat)
dat.insert(2, 'Mileage', mil_dat)

return dat
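For context, a minimal sketch (not part of the commit) of how the new split_elr_mileage_column() helper behaves; the single-row sample below is made up, and the '\t\t / [...]' layout is assumed from the split regex above.

import pandas as pd

from pyrcs.other_assets.station import Stations

# Hypothetical one-row input combining ELR and mileage in a single column.
sample = pd.DataFrame({
    'Station': ['Example Parkway'],
    'ELRMileage': ['ABC\t\t / [1m 23ch]'],
})

# The helper is a @staticmethod, so it can be called on the class directly.
parsed = Stations.split_elr_mileage_column(sample.copy())

print(parsed.columns.tolist())  # ['Station', 'ELR', 'Mileage']
print(parsed.loc[0, 'ELR'])     # 'ABC'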

@staticmethod
def check_row_spans(dat):
"""
@@ -199,34 +232,35 @@ def check_row_spans(dat):
:rtype: pandas.DataFrame
"""

temp0 = dat['Degrees Longitude'].str.split(' / ')
temp1 = dat[temp0.map(len).map(lambda x: True if x > 1 else False)]
# temp0 = dat['Degrees Longitude'].str.split(' / ')
temp0 = dat['Degrees Longitude'].str.split(r' / |\r', regex=True)
temp1 = dat[temp0.map(lambda x: True if len(x) > 1 else False)]

cols = ['ELR', 'Mileage', 'Degrees Longitude', 'Degrees Latitude', 'Grid Reference']
cols_ = [x for x in temp1.columns if x not in cols]

temp_dat = []
for col in cols:
# noinspection PyUnresolvedReferences
temp2 = temp1[col].map(lambda x: x.split(' / '))
temp2 = temp1[col].str.split(r' &&& |\r| / ', regex=True)

if cols.index(col) == 0:
# noinspection PyTypeChecker
temp_dat_ = temp1[cols_].join(temp2).explode(col)
temp_dat_.index = range(len(temp_dat_))
temp_dat_ = temp1[cols_].join(temp2).explode(col, ignore_index=True)
else:
temp_dat_ = temp2.explode(col).to_frame()
temp_dat_ = temp2.explode(ignore_index=True).to_frame(name=col)

temp_dat.append(temp_dat_)

temp_data = pd.concat(temp_dat, axis=1)
temp_data = temp_data[dat.columns.to_list()]

dat = pd.concat([dat.drop(index=temp1.index), temp_data], axis=0, ignore_index=True)
dat0 = pd.concat([dat.drop(index=temp1.index), temp_data], axis=0, ignore_index=True)

dat.sort_values(['Station'], ignore_index=True, inplace=True)
dat0[['ELR', 'Mileage']] = dat0[['ELR', 'Mileage']].applymap(lambda x: x.split(' &&& '))
dat0 = dat0.explode(['ELR', 'Mileage'], ignore_index=True)

return dat
dat0.sort_values(['Station'], ignore_index=True, inplace=True)

return dat0
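As a hedged illustration (again, not part of the commit), the reworked check_row_spans() should expand a single row whose cells pack two stations' values (separated by ' / ', a carriage return, or the ' &&& ' marker) into two plain rows; all values below are invented.

import pandas as pd

from pyrcs.other_assets.station import Stations

# One made-up row that holds two sets of values in each location column.
sample = pd.DataFrame({
    'Station': ['Example Central'],
    'ELR': ['AAA &&& BBB'],
    'Mileage': ['1m 10ch &&& 2m 20ch'],
    'Degrees Longitude': ['-0.10 / -0.20'],
    'Degrees Latitude': ['51.10 / 51.20'],
    'Grid Reference': ['TQ300800 / TQ310810'],
})

expanded = Stations.check_row_spans(sample.copy())
print(len(expanded))  # 2 - one row per ELR/coordinate pair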

@staticmethod
def parse_coordinates_columns(dat):
@@ -262,21 +296,25 @@ def parse_station_column(dat):
x = 'Heathrow Junction [sometimes referred to as Heathrow Interchange]\t\t / [no CRS?]'
"""

temp1 = dat['Station'].str.split('\t\t', expand=True)
temp1 = dat['Station'].str.split('\t\t', n=1, expand=True)
temp1.columns = ['Station', 'CRS']
dat['Station'] = temp1['Station']
dat['Station'] = temp1['Station'].str.rstrip(' / ').str.strip()

# Get notes for stations
stn_note_ = pd.Series('', index=dat.index)
for i, x in enumerate(temp1['Station']):
if '[' in x and ']':
y = re.search(r' \[(.*)]', x).group(0) # Station Note
dat.loc[i, 'Station'] = x.replace(y, '')
stn_note_[i] = y.strip(' []')
if '[' in x and ']' in x:
y = re.search(r' \[(.*)](✖.*)?', x).group(0) # Station Note
dat.loc[i, 'Station'] = x.replace(y, '').strip()
if '✖' in y:
stn_note_[i] = '; '.join([y_.strip(' []') for y_ in y.split('✖')])
else:
stn_note_[i] = y.strip(' []')

dat.insert(loc=dat.columns.get_loc('Station') + 1, column='Station Note', value=stn_note_)

temp2 = temp1['CRS'].str.replace(' / /', ' &&& ').str.split(' / ', expand=True).fillna('')
temp2 = temp1['CRS'].str.replace(' / /', ' &&& ').str.split(
r' | / ', regex=True, expand=True).fillna('')

if temp2.shape[1] == 1:
temp2.columns = ['CRS']
@@ -287,14 +325,14 @@ def parse_station_column(dat):

temp2['CRS'] = temp2['CRS'].str.replace(r'[()]', '', regex=True).map(
lambda z: ' and '.join(['{} [{}]'.format(*z_.split('✖')) for z_ in z.split(' &&& ')])
if ' &&& ' in z else z)
if ' &&& ' in z else z).str.strip()

dat = pd.concat([dat, temp2], axis=1)

return dat
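A small, hypothetical example of the station-name handling above: the CRS code after the '\t\t' separator is split off, and a bracketed note is moved into a 'Station Note' column. This assumes the collapsed context lines do not affect the single-CRS path shown here.

import pandas as pd

from pyrcs.other_assets.station import Stations

# Made-up station name carrying a bracketed note and a CRS code.
sample = pd.DataFrame({'Station': ['Example Town [request stop]\t\tXET']})

parsed = Stations.parse_station_column(sample.copy())

print(parsed.loc[0, 'Station'])       # 'Example Town'
print(parsed.loc[0, 'Station Note'])  # 'request stop'
print(parsed.loc[0, 'CRS'])           # 'XET'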

@staticmethod
def _parse_owner_and_operator(x, sep=' / '):
def _parse_owner_and_operator(x):
"""
x = dat['Owner'][0]
x = dat['Owner'][1]
@@ -303,13 +341,13 @@ def _parse_owner_and_operator(x, sep=' / '):
if ' / and / ' in x:
y, y_ = x.replace(' / and / ', ' &&& '), ''

elif ' / ' in x:
x_ = x.split(sep)
elif ' / ' in x or '\r' in x:
x_ = re.split(r' / |\r', x)

# y - Owners or operators; y_ - Former owners or operators
if len(x_) > 1:
y = x_[0]
y_ = x_[1] if len(x_[1:]) == 1 else sep.join(x_[1:])
y_ = x_[1] if len(x_[1:]) == 1 else ' / '.join(x_[1:])
else:
y, y_ = x_[0], ''

@@ -319,6 +357,9 @@ def _parse_owner_and_operator(x, sep=' / '):
if '✖' in y and ' &&& ' in y:
y = ' and '.join(['{} [{}]'.format(*z.split('✖')) for z in y.split(' &&& ')])

if ' [from' in y:
y = remove_punctuation(y)

return y, y_
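A quick, made-up example of the separator handling above: when a ' / ' (or a carriage return) divides the current value from the former one, the helper returns them as a (current, former) pair. The company names are placeholders rather than values from the site.

from pyrcs.other_assets.station import Stations

# _parse_owner_and_operator() is a @staticmethod, so it can be exercised directly.
current, former = Stations._parse_owner_and_operator('Network Rail / Railtrack')

print(current)  # 'Network Rail'
print(former)   # 'Railtrack'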

def parse_owner_and_operator_columns(self, dat):
@@ -334,40 +375,15 @@ def parse_owner_and_operator_columns(self, dat):
owner_operator = []
for col in ['Owner', 'Operator']:
temp = pd.DataFrame(
dat[col].map(self._parse_owner_and_operator).to_list(), columns=[col, 'Former ' + col])
dat[col].map(self._parse_owner_and_operator).to_list(), columns=[col, 'Former ' + col],
index=dat.index)
del dat[col]
owner_operator.append(temp)

dat = pd.concat([dat] + owner_operator, axis=1)

return dat

@staticmethod
def parse_elr_mileage_columns(dat):
"""
Parse ``'ELR'`` and ``'Mileage'`` of the station locations data.
:param dat: preprocessed data of the station locations
:type dat: pandas.DataFrame
:return: data with parsed ``'ELR'`` and ``'Mileage'``
:rtype: pandas.DataFrame
"""

dat['Mileage'] = dat['Mileage'].map(lambda x: ']'.join(x.replace(' / (', ' [').rsplit(')', 1)))

em_col_names = ['ELR', 'Mileage']
dat[em_col_names] = dat[em_col_names].applymap(
lambda x: x.replace(' / / ', ' / ').replace(' / [', ' [').split(' / ')
if ' / ' in x else x)

# Where the Mileage data indicates the start and end
idx = dat[dat[em_col_names].apply(
lambda x: len(x.Mileage) != len(x.ELR) and isinstance(x.Mileage, list),
axis=1)].index
dat.loc[idx, 'Mileage'] = dat.loc[idx, 'Mileage'].map(lambda x: ' - '.join(x))

return dat

def collect_locations_by_initial(self, initial, update=False, verbose=False):
"""
Collect `data of railway station locations
@@ -447,7 +463,7 @@ def collect_locations_by_initial(self, initial, update=False, verbose=False):

data = {beginning_with: None, self.KEY_TO_LAST_UPDATED_DATE: None}

# url = stn.URL.replace('station0', 'station{}'.format(beginning_with.lower()))
# url = stn.URL.replace('station0', 'station{}'.format(initial_))
url = self.URL.replace('station0', 'station{}'.format(initial_))
try:
source = requests.get(url=url, headers=fake_requests_headers())
@@ -464,31 +480,54 @@ def collect_locations_by_initial(self, initial, update=False, verbose=False):
if any(x is None for x in {thead, tbody}):
if verbose == 2:
print(f"There are no stations starting with '{beginning_with}'.")
# f"There are no British towns starting with '{beginning_with}'.

else:
# Create a DataFrame of the requested table
trs = tbody.find_all(name='tr')
ths = [re.sub(r'\n?\r+\n?', ' ', h.text).strip() for h in thead.find_all('th')]
dat = parse_tr(trs=trs, ths=ths, as_dataframe=True)
dat_ = parse_tr(trs=trs, ths=ths, as_dataframe=True)

dat_ = dat.copy()
dat = dat_.copy()

parser_funcs = [
self.split_elr_mileage_column,
self.check_row_spans,
self.parse_coordinates_columns,
self.parse_station_column,
self.parse_owner_and_operator_columns,
self.parse_elr_mileage_columns,
]
for parser_func in parser_funcs:
dat_ = parser_func(dat_)
dat = parser_func(dat)

# # Debugging
# for parser_func in parser_funcs:
# try:
# dat_ = parser_func(dat_)
# except Exception:
# print(parser_func)
# break

# Explode by ELR and Mileage
dat_ = dat_.explode(column=['ELR', 'Mileage'], ignore_index=True)
dat = dat.explode(column=['ELR', 'Mileage'], ignore_index=True)

errata_ = {
"-By-": "-by-",
"-In-": "-in-",
"-En-Le-": "-en-le-",
"-La-": "-la-",
"-Le-": "-le-",
"-On-": "-on-",
"-The-": "-the-",
" Of ": " of ",
"-Super-": "-super-",
"-Upon-": "-upon-",
"-Under-": "-under-",
"-Y-": "-y-",
}
dat['Station'].replace(errata_, regex=True, inplace=True)

data = {
beginning_with: dat_.sort_values('Station', ignore_index=True),
beginning_with: dat.sort_values('Station', ignore_index=True),
self.KEY_TO_LAST_UPDATED_DATE: get_last_updated_date(url=url, parsed=True)
}

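Finally, a hedged usage sketch of the patched collector (not part of the commit): it re-fetches the station-locations table for names beginning with 'A'. An internet connection is required, and the no-argument constructor is assumed from the class's defaults.

from pyrcs.other_assets.station import Stations

stn = Stations()

# Re-collect data for stations whose names begin with 'A'; the result should
# be a dict keyed by the initial letter plus a last-updated-date entry
# (see KEY_TO_LAST_UPDATED_DATE above).
locations_a = stn.collect_locations_by_initial(initial='a', update=True, verbose=True)

print(locations_a['A'].head())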
