# -*- coding: utf-8 -*-
import pytz
import re
import sys
from curses import ascii
from dateutil import parser
from datetime import datetime
from urlparse import urlparse
from xml.sax.saxutils import escape


class DEFAULT:
    """Sentinel object used to distinguish "argument omitted" from None."""

    def __repr__(self):
        return "<default>"


# Shadow the class with its single instance so the sentinel is unique.
DEFAULT = DEFAULT()


def utc_date_to_string(utc_date):
    """Format a UTC datetime as an ISO-8601 string with a 'Z' suffix."""
    iso_8601_utc = "%Y-%m-%dT%H:%M:%S.%fZ"
    return utc_date.strftime(iso_8601_utc)


def string_to_utc_date(from_date=None):
    """Parse a date string into a timezone-aware UTC datetime.

    :param from_date: date string in any format dateutil can parse, or
        None to get the current UTC time.
    :return: timezone-aware datetime in UTC.
    :raises ValueError: if dateutil cannot parse the string.
    """
    if from_date is None:
        return datetime.utcnow().replace(tzinfo=pytz.utc)

    dt = parser.parse(from_date)
    if dt.utcoffset() is None:
        # Naive datetime: assume the wall time is already UTC.
        return dt.replace(tzinfo=pytz.utc)
    # Aware datetime: convert to UTC. The previous code relabelled the
    # wall time as UTC and then ADDED the offset, which shifted the
    # result in the wrong direction (e.g. 18:30+02:00 became 20:30Z
    # instead of 16:30Z).
    return dt.astimezone(pytz.utc)


def string_to_utc_string(from_date):
    """Normalise a date string to the canonical UTC timestamp format."""
    utc_date = string_to_utc_date(from_date)
    return utc_date_to_string(utc_date)


def date_to_rfc2822_string(from_date):
    """Format a datetime as an RFC 2822 date string."""
    rfc2822_fmt = '%a, %d %b %Y %H:%M:%S %z'
    return from_date.strftime(rfc2822_fmt)


def validate_domain_string(domain_string):
    """Validates domain query parameter syntax.

    Examples:
        All the following examples are well formed:
            `Travel`
            `History OR Travel`
            `gs:uk OR gs:mx OR gs:pt`
            `gs:uk or gs:mx or gs:pt`
            `Travel,History`

       The following examples are incorrect:
            `gs:uk gs:mx gs:pt`
            `Travel;History`
            `Travel, History`
            `Travel , History`

    :param domain_string: raw domain query parameter value.
    :return: the validated string, unchanged.
    :raises ValueError: when the syntax is invalid.
    """
    allowed_separator = 'OR'

    # Each comma-separated part is a space-separated run of tokens made
    # of letters and ':' only (raw string so `\s` is a regex escape).
    domain_regex = re.compile(r'^([a-zA-Z:]+\s)*[a-zA-Z:]+$')

    valid_domains = all(domain_regex.match(bit) is not None for bit in domain_string.split(','))

    # Domains sit at even positions, separators at odd ones. An even
    # token count means a dangling separator (e.g. `Travel OR`), which
    # the odd-position check alone would let through.
    tokens = domain_string.split()
    valid_separators = (len(tokens) % 2 == 1 and
                        all(sep.upper() == allowed_separator for sep in tokens[1::2]))

    if not valid_domains or not valid_separators:
        raise ValueError(
            ('Invalid Domain. Domain must contain only characters or spaces.\n'
             'For example: domain=Travel or domain=News,Travel or domain=News OR Travel')
        )

    return domain_string


def validate_rfc3339_date(datestring):
    """Validate that *datestring* is an RFC 3339 timestamp.

    :param datestring: candidate timestamp string.
    :return: the validated string, unchanged.
    :raises ValueError: when the format or the date itself is invalid.
    """
    rfc3339_pattern = (
        r"^(\d\d\d\d)\-(\d\d)\-(\d\d)T"
        r"(\d\d):(\d\d):(\d\d)(\.\d+)?(Z|([+\-])(\d\d):(\d\d))$")

    if re.match(rfc3339_pattern, datestring) is None:
        raise ValueError('Invalid format. Datetime format RFC3339. Example: 2015-08-13T18:30:02Z')

    # dateutil raises ValueError for well-formed but impossible dates,
    # e.g. month 2 day 30 — let that propagate.
    parser.parse(datestring)

    return datestring


def validate_hours_day_format(datestring):
    """Validate a relative time string such as '7d' or '5h'.

    Accepted ranges are 1-100 days ('d') and 1-23 hours ('h').

    :param datestring: candidate relative time string.
    :return: the validated string, unchanged.
    :raises ValueError: when the unit or the range is invalid.
    """
    try:
        quantity = int(datestring[:-1])
        unit = datestring[-1:]

        if unit == 'd' and 1 <= quantity <= 100:
            return datestring
        if unit == 'h' and 1 <= quantity <= 23:
            return datestring
        raise ValueError()
    except ValueError:
        raise ValueError('Invalid format. Range: 1-23h 1-100d. Example: 7d, 5h')


def validate_uri_format(obj_uri, query):
    """Check that the path of *obj_uri* matches *query*.

    :param obj_uri: absolute URI whose path (minus the leading '/') is
        compared against the query.
    :param query: expected path string.
    :return: True when the path matches, False otherwise.
    """
    obj_path = urlparse(obj_uri).path[1:]
    # Mirror a trailing slash onto the expected query so `/foo/` still
    # matches `foo`. endswith() is safe on an empty path, whereas the
    # previous `obj_path[-1]` raised IndexError for URIs with no path.
    if obj_path.endswith('/'):
        query = fix_uri_trailing_slash(query)
    return obj_path == query


def fix_uri_trailing_slash(uri):
    """Return *uri* with a trailing slash appended when it lacks one.

    Uses endswith() rather than `uri[-1]`, which raised IndexError for
    an empty string.
    """
    if not uri.endswith('/'):
        uri += '/'
    return uri


def get_worst_status(status_list):
    """
    Return the worst status based on a status list.

    :param status_list: The list of status, the status can be the string
                        'green', 'yellow' or 'red', invalid status are
                        ignored.
    :return: The worst status from the list (green, yellow or red).
    """
    # Probe in order of decreasing severity; anything unrecognised
    # falls through to 'green'.
    for candidate in ('red', 'yellow'):
        if candidate in status_list:
            return candidate
    return 'green'


##########################################
# Used to deal with invalid xml characters
##########################################
def invalid_xml_remove(text):
    """Strip characters that are illegal in XML 1.0 from *text*.

    # http://stackoverflow.com/questions/1707890/fast-way-to-filter-illegal-xml-unicode-chars-in-python
    """
    # Code-point ranges forbidden by the XML 1.0 specification.
    forbidden_ranges = [
        (0x00, 0x08), (0x0B, 0x1F), (0x7F, 0x84), (0x86, 0x9F),
        (0xD800, 0xDFFF), (0xFDD0, 0xFDDF), (0xFFFE, 0xFFFF),
        (0x1FFFE, 0x1FFFF), (0x2FFFE, 0x2FFFF), (0x3FFFE, 0x3FFFF),
        (0x4FFFE, 0x4FFFF), (0x5FFFE, 0x5FFFF), (0x6FFFE, 0x6FFFF),
        (0x7FFFE, 0x7FFFF), (0x8FFFE, 0x8FFFF), (0x9FFFE, 0x9FFFF),
        (0xAFFFE, 0xAFFFF), (0xBFFFE, 0xBFFFF), (0xCFFFE, 0xCFFFF),
        (0xDFFFE, 0xDFFFF), (0xEFFFE, 0xEFFFF), (0xFFFFE, 0xFFFFF),
        (0x10FFFE, 0x10FFFF)]

    # Skip ranges above sys.maxunicode (narrow Python 2 builds cannot
    # represent code points beyond 0xFFFF).
    character_classes = []
    for low, high in forbidden_ranges:
        if low < sys.maxunicode:
            character_classes.append(u"%s-%s" % (unichr(low), unichr(high)))

    illegal_xml_re = re.compile(u'[%s]' % u''.join(character_classes))
    return illegal_xml_re.sub('', text)


def scrub_literal(value):
    r"""
    Scrubs control characters from the incoming values to remove
    things like form feeds (\f) and line breaks (\n) which might
    cause problems with Jena.
    Data with these characters was found in the Backstage data.

    :param value: string (str/unicode) or number to scrub.
    :return: None for falsy input, the value untouched for integers,
        otherwise a stripped unicode string.
    """
    if not value:
        return
    # Numbers need no scrubbing; hand them back untouched.
    if type(value) in (long, int):
        return value
    # Drop ASCII control characters. iscntrl() already matches every
    # character isctrl() does, so the previous double test
    # (`if not iscntrl(c) if not isctrl(c)`) was redundant.
    n = ''.join(c for c in value if not ascii.iscntrl(c))
    n = n.replace('"', '')
    n = clean_char(n)
    if type(n) != unicode:
        n = unicode(n, errors='replace')
    # Remove the Unicode replacement character. The original used the
    # byte-string literal '\ufffd', which on Python 2 is the six
    # characters backslash-u-f-f-f-d and never matched U+FFFD itself;
    # doing it after the unicode conversion also avoids implicit
    # ASCII decoding of byte strings.
    n = n.replace(u'\ufffd', '')
    return n.strip()


def sanitize_xml_text(text):
    """
    Removes all invalid XML characters and escape al special characters.

    Falsy input (None, empty string) is returned unchanged.
    """
    if not text:
        return text
    cleaned = invalid_xml_remove(text)
    return escape(scrub_literal(cleaned))


def clean_char(char):
    """
    Function for remove invalid XML characters from
    incoming data.

    :param char: string to clean (despite the name, a whole string).
    :return: the string with ANSI escapes, invalid XML characters and
        known problem characters removed or replaced.
    """
    # Get rid of the ctrl characters first.
    # http://stackoverflow.com/questions/1833873/python-regex-escape-characters
    char = re.sub('\x1b[^m]*m', '', char)
    # Clean up invalid xml
    char = invalid_xml_remove(char)
    replacements = [
        (u'\u201c', '\"'),
        (u'\u201d', '\"'),
        (u"\u001B", ' '),  # http://www.fileformat.info/info/unicode/char/1b/index.htm
        (u"\u0019", ' '),  # http://www.fileformat.info/info/unicode/char/19/index.htm
        (u"\u0016", ' '),  # http://www.fileformat.info/info/unicode/char/16/index.htm
        (u"\u001C", ' '),  # http://www.fileformat.info/info/unicode/char/1c/index.htm
        (u"\u0003", ' '),  # http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=0x
        (u"\u000C", ' ')
    ]
    # Replace every occurrence of each bad character. The previous
    # `if char == rep: return new_char` only fired when the WHOLE
    # string equalled the bad character, so smart quotes etc. embedded
    # in longer text were never replaced.
    for bad, good in replacements:
        char = char.replace(bad, good)
    return char
