import re
from collections import namedtuple
I am going to build the desired regular expression for extracting coordinates from text, based on the coordinate order (lat-lon or lon-lat), the coordinate format, and the separator character. The separator here is the one between latitude and longitude within a single coordinate pair, not the one between one coordinate pair and the next.
# Coordinates order: which value comes first inside a coordinate pair.
ORDER_LATLON = 'LATLON'
ORDER_LONLAT = 'LONLAT'

# Coordinate format constants (keys identifying a pair of lat/lon patterns).
# D, M, S, H stand for degrees, minutes, seconds and hemisphere letter; the
# letter order in the name mirrors the order in the text (hemisphere last in
# DMSH, first in HDMS). *_COMP is the compact, digits-only form and *_SEP the
# form with separator symbols between the parts.
DMSH_COMP = 'DMSH_COMP'
HDMS_COMP = 'HDMS_COMP'
DMSH_SEP = 'DMSH_SEP'
HDMS_SEP = 'HDMS_SEP'

# Separators constants - separates latitude and longitude in pair, not pairs!
# Each value is a regular-expression fragment, not plain text: that is why
# the backslash separator is spelled r'\\' (a regex matching one backslash).
SEP_NULL = r''
SEP_SPACE = r' '
SEP_HYPHEN = r'-'
SEP_SLASH = r'/'
SEP_BACKSLASH = r'\\'
Now it's time to define a named tuple for a coordinate pair. It will be convenient to use this tuple in the dictionary of regex patterns:
# Lightweight record holding the two halves of a coordinate pair; used below
# to store one regex pattern for latitude and one for longitude per format.
coord_pair = namedtuple('coord_pair', ['lat', 'lon'])
And dictionary with regex patterns:
# Regex patterns for each supported coordinate format, keyed by the format
# constant. Every entry is a coord_pair of (latitude pattern, longitude
# pattern). Each pattern is an alternation: the variant with decimal seconds
# comes first, followed by the variant with integer seconds.
#
# In the *_SEP formats \W (any non-word character) stands in for the degree,
# minute and second symbols, so no specific symbol or encoding is required.
#
# NOTE(review): in the *_SEP latitude patterns the decimal-seconds branch
# requires exactly two minute digits (\d{2}) while every other branch accepts
# one or two (\d{1,2}); left unchanged here - confirm whether it is intended.
coord_formats = {
    DMSH_COMP: coord_pair(
        r'\d{6}\.\d+[NS]|\d{6}[NS]',
        r'\d{7}\.\d+[EW]|\d{7}[EW]',
    ),
    HDMS_COMP: coord_pair(
        r'[NS]\d{6}\.\d+|[NS]\d{6}',
        r'[EW]\d{7}\.\d+|[EW]\d{7}',
    ),
    DMSH_SEP: coord_pair(
        r'\d{1,2}\W\d{2}\W\d{1,2}\.\d+\W{1,2}[NS]'
        r'|\d{1,2}\W\d{1,2}\W\d{1,2}\W{1,2}[NS]',
        r'\d{1,3}\W\d{1,2}\W\d{1,2}\.\d+\W{1,2}[EW]'
        r'|\d{1,3}\W\d{1,2}\W\d{1,2}\W{1,2}[EW]',
    ),
    HDMS_SEP: coord_pair(
        r'[NS]\d{1,2}\W\d{2}\W\d{1,2}\.\d+\W{1,2}'
        r'|[NS]\d{1,2}\W\d{1,2}\W\d{1,2}\W{1,2}',
        # The integer-seconds branch previously ended in '\{1,2}W', which
        # matches the literal text "{1,2}W"; the intended trailing symbol
        # matcher is '\W{1,2}', as in the other three *_SEP patterns.
        r'[EW]\d{1,3}\W\d{1,2}\W\d{1,2}\.\d+\W{1,2}'
        r'|[EW]\d{1,3}\W\d{1,2}\W\d{1,2}\W{1,2}',
    ),
}
Notice that for the DMSH_SEP and HDMS_SEP formats I used the \W special sequence, which matches any character that is not a word character. I am assuming here that the degrees, minutes and seconds parts of the DMS format are separated by the degree, minute and second symbols, as in these examples:
85°52'55"W, 33°45'14"N, 48°54'04.05"N, 002°21'10"E
This is a simple way to avoid putting special characters (the degree symbol, for example) into the pattern and running into issues with different encodings (UTF-8, ASCII, etc.). Because it is not a 'strict' way to check whether the given text is in the separated DMS format, it may lead to unexpected results.
class CoordRegexBuilder:
    """Build a regular expression that matches one coordinate pair.

    The pattern is assembled from three named groups - ``lat``, ``sep`` and
    ``lon`` - arranged according to the requested coordinate order.

    Args:
        coord_order: ORDER_LATLON or ORDER_LONLAT; decides whether the
            latitude or the longitude group comes first.
        coord_format: key of ``coord_formats`` selecting the latitude and
            longitude patterns.
        coord_sep: regular-expression fragment matching the separator
            between latitude and longitude. It is inserted into the pattern
            verbatim (not escaped), so it must already be valid regex.

    Raises:
        ValueError: if ``coord_format`` is not a key of ``coord_formats`` or
            ``coord_order`` is not one of the two supported orders.
    """

    def __init__(self, coord_order, coord_format, coord_sep):
        self.coord_order = coord_order
        self.coord_format = coord_format
        self.coord_sep = coord_sep
        # Built eagerly so that invalid arguments are reported on construction.
        self.coord_regex_str = self.create_regex_str()

    # Methods that create the regular expression string (pattern) and compile
    # it into a regex object:

    def create_regex_str(self):
        """Return the pattern string for the configured order/format/separator.

        Raises:
            ValueError: on an unknown coordinate format or coordinate order.
        """
        try:
            patterns = coord_formats[self.coord_format]
        except KeyError:
            raise ValueError(
                'Unknown coordinate format: {!r}'.format(self.coord_format))

        if self.coord_order == ORDER_LATLON:
            parts = (('lat', patterns.lat),
                     ('sep', self.coord_sep),
                     ('lon', patterns.lon))
        elif self.coord_order == ORDER_LONLAT:
            parts = (('lon', patterns.lon),
                     ('sep', self.coord_sep),
                     ('lat', patterns.lat))
        else:
            # An empty pattern would match the empty string at every position
            # of the text, so an unknown order is rejected instead.
            raise ValueError(
                'Unknown coordinate order: {!r}'.format(self.coord_order))

        # Each part goes into its own named group; this also keeps the
        # alternations inside the lat/lon patterns from spilling into the
        # neighbouring parts.
        return ''.join(
            '(?P<{}>{})'.format(name, pattern) for name, pattern in parts)

    def get_coord_regex(self):
        """Return the pattern string compiled into a regex object."""
        return re.compile(self.coord_regex_str)
No comments:
Post a Comment