[ckan-changes] commit/datautil-date: dread: Copied just the date stuff from github datautil, for ckan use.
Bitbucket
commits-noreply at bitbucket.org
Tue Sep 27 18:48:34 UTC 2011
1 new changeset in datautil-date:
http://bitbucket.org/okfn/datautil-date/changeset/2dc27829de20/
changeset: 2dc27829de20
user: dread
date: 2011-09-27 20:46:22
summary: Copied just the date stuff from github datautil, for ckan use.
affected #: 5 files (-1 bytes)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datautil/__init__.py Tue Sep 27 19:46:22 2011 +0100
@@ -0,0 +1,1 @@
+__version__ = '0.4'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datautil/date.py Tue Sep 27 19:46:22 2011 +0100
@@ -0,0 +1,282 @@
+'''Date parsing and normalization utilities based on FlexiDate.
+
+To parse dates use parse, e.g.::
+
+ parse('1890') -> FlexiDate(year=u'1890')
+ parse('1890?') -> FlexiDate(year=u'1890', qualifier='Uncertainty : 1890?')
+
+Once you have a FlexiDate you can get access to attributes (strings of course
+...)::
+
+ fd = parse('Jan 1890')
+ fd.year # u'1890'
+ fd.month # u'01'
+
+And convert to other forms:
+
+ fd.as_float() # 1890.0833... (year + month/12)
+ fd.as_datetime() # datetime(1890, 1, 1)
+
+Background
+==========
+
+FlexiDate is focused on supporting:
+
+ 1. Dates outside of Python (or DB) supported period (esp. dates < 0 AD)
+ 2. Imprecise dates (c.1860, 18??, fl. 1534, etc)
+ 3. Normalization of dates to machine processable versions
+ 4. Sortable in the database (in correct date order)
+
+For more information see:
+
+http://www.rufuspollock.org/2009/06/18/flexible-dates-in-python/
+'''
+import re
+import datetime
+
class FlexiDate(object):
    """Store dates as strings and present them in a slightly extended version
    of ISO8601.

    Modifications:
      * Allow a trailing qualifier, e.g. ``fl.``
      * Allow replacement of unknown values by ``?``, e.g. a date sometime in
        the 1800s can be written ``18??``

    Restrictions on ISO8601:
      * Truncation (e.g. of centuries) is *not* permitted.
      * No week and day representation, e.g. 1999-W01

    Attributes:
      year, month, day: zero-padded text (4, 2 and 2 characters wide), or
        ``''`` when the component is missing.
      qualifier: free-text note, stored exactly as passed to the constructor.
    """

    # Text type used to stringify components: ``unicode`` on Python 2 and
    # ``str`` on Python 3, where the ``unicode`` builtin no longer exists.
    try:
        _text_type = unicode  # noqa: F821
    except NameError:
        _text_type = str

    def __init__(self, year=None, month=None, day=None, qualifier=''):
        """Build a date from optional year, month, day and qualifier.

        @param year: number or string; padded to 4 characters.
        @param month: number or string; padded to 2 characters.
        @param day: number or string; padded to 2 characters.
        @param qualifier: free text attached to the date (e.g. 'fl.').
        """
        self.year = self._cvt(year, rjust=4)
        self.month = self._cvt(month)
        self.day = self._cvt(day)
        self.qualifier = qualifier

    def _cvt(self, val, rjust=2, force=False):
        """Convert a date component to zero-padded text.

        @param val: the component; ``None`` or ``''`` means "missing".
        @param rjust: minimum width, not counting a leading '-'.
        @param force: if true, a missing component is rendered as ``rjust``
            '!' characters instead of ``''``.
        @return: the padded text.
        """
        # Compare against None/'' explicitly: a plain truthiness test would
        # treat the number 0 (e.g. year 0) as a missing component.
        if val is not None and val != '':
            text = self._text_type(val).strip()
            if text.startswith('-'):
                # pad the digits only, keeping the sign in front
                return '-' + text[1:].rjust(rjust, '0')
            return text.rjust(rjust, '0')
        if force:
            # use '!' rather than '?' as '!' < '1' while '?' > '1'
            return rjust * '!'
        return ''

    def __str__(self):
        """Return ``isoformat()`` followed by ' [qualifier]' if one is set."""
        out = self.isoformat()
        if self.qualifier:
            # The leading space matters: with no year the string starts with
            # ' ', and ' ' < '1', so such values sort before real years.
            out += u' [%s]' % self.qualifier
        return out

    def __repr__(self):
        return u'%s %s' % (self.__class__, self.__str__())

    def isoformat(self, strict=False):
        '''Return date in isoformat (same as __str__ but without qualifier).

        Components are joined with '-' and output stops at the first missing
        one (so a day without a month is not shown).

        WARNING: does not replace '?' in dates unless strict=True, in which
        case every '?' becomes '0'.
        '''
        out = self.year
        # NOTE: with no year the result starts with '-' if a month is set.
        for val in (self.month, self.day):
            if not val:
                break
            out += u'-' + val
        if strict:
            out = out.replace('?', '0')
        return out

    # Raw string so that \d, \s and \[ reach the regex engine untouched
    # without relying on Python passing unknown string escapes through.
    our_re_pat = r'''
        (?P<year> -?[\d?]+)
        (?:
            \s* - (?P<month> [\d?]{1,2})
            (?: \s* - (?P<day> [\d?]{1,2}) )?
        )?
        \s*
        (?: \[ (?P<qualifier>[^]]*) \])?
        '''
    our_re = re.compile(our_re_pat, re.VERBOSE)
    # __str__ of a date that has only a qualifier yields ' [text]', which
    # our_re cannot match because it requires a year.
    _qualifier_only_re = re.compile(r'\s*\[(?P<qualifier>[^]]*)\]')

    @classmethod
    def from_str(cls, instr):
        '''Undo the effect of __str__.

        @param instr: text as produced by ``__str__``.
        @return: an instance of the class this is called on; an empty one
            for empty input; ``None`` if the text has neither a leading year
            nor a leading '[qualifier]'.
        '''
        if not instr:
            return cls()

        found = cls.our_re.match(instr)
        if found is not None:
            return cls(
                found.group('year'),
                found.group('month'),
                found.group('day'),
                # an absent group is None; normalise to the default ''
                qualifier=found.group('qualifier') or ''
            )

        found = cls._qualifier_only_re.match(instr)
        if found is not None:
            return cls(qualifier=found.group('qualifier'))
        # no match TODO: raise Exception?
        return None

    def as_float(self):
        '''Get as a float (year being the integer part).

        Replace '?' in year with 9 so as to be conservative (e.g. 19?? becomes
        1999) and elsewhere (month, day) with 0. The month contributes
        month/12 and the day day/365; the day is only counted when a month
        is present.

        @return: float, or None if there is no year.
        '''
        if not self.year:
            return None
        out = float(self.year.replace('?', '9'))
        if self.month:
            # TODO: we are assuming months are of equal length
            out += float(self.month.replace('?', '0')) / 12.0
            if self.day:
                out += float(self.day.replace('?', '0')) / 365.0
        return out

    def as_datetime(self):
        '''Get as python datetime.datetime.

        Require year to be a valid datetime year. Default month and day to 1 if
        do not exist.

        @return: datetime.datetime object.
        @raise ValueError: if a component is not a plain integer (for example
            it is empty or contains '?') or datetime rejects the values.
        '''
        year = int(self.year)
        month = int(self.month) if self.month else 1
        day = int(self.day) if self.day else 1
        return datetime.datetime(year, month, day)
+
+
def parse(date, dayfirst=True):
    '''Parse a `date` into a `FlexiDate`.

    @param date: the date to parse - may be a string, an int (taken as a
        year), datetime.date, datetime.datetime or FlexiDate.
    @param dayfirst: passed to the dateutil-based string parser as its
        `dayfirst` keyword argument.
    @return: None for falsy input (None, '', 0); the same object if `date`
        is already a FlexiDate; otherwise a FlexiDate. A string that cannot
        be parsed gives a FlexiDate with no year whose qualifier is
        'UNPARSED: <input>' with non-ASCII characters dropped.

    TODO: support for quarters e.g. Q4 1980 or 1954 Q3
    TODO: support latin stuff like M.DCC.LIII
    TODO: convert '-' to '?' when used that way
        e.g. had this date [181-]
    '''
    if not date:
        return None
    if isinstance(date, FlexiDate):
        return date
    if isinstance(date, int):
        return FlexiDate(year=date)
    if isinstance(date, datetime.date):
        # datetime.datetime is a subclass of datetime.date
        return PythonDateParser().parse(date)

    # anything else is assumed to be a string
    out = DateutilDateParser().parse(date, dayfirst=dayfirst)
    if out is not None:
        return out
    val = 'UNPARSED: %s' % date
    # Strip non-ASCII characters, then decode again so the qualifier is text;
    # on Python 3 the bare encode() result would be bytes and be rendered
    # as "b'...'" by FlexiDate.__str__.
    val = val.encode('ascii', 'ignore').decode('ascii')
    return FlexiDate(qualifier=val)
+
+
class DateParserBase(object):
    '''Interface shared by the date parsers: subclasses implement `parse`.'''

    def parse(self, date):
        '''Turn `date` into a parsed date object; must be overridden.

        @raise NotImplementedError: always, in this base class.
        '''
        raise NotImplementedError

    def norm(self, date):
        '''Return the string form of whatever `parse(date)` returns.'''
        return str(self.parse(date))


class PythonDateParser(DateParserBase):
    '''Parser for objects exposing `year`, `month` and `day` attributes,
    such as datetime.date and datetime.datetime.

    Derives from DateParserBase (like the dateutil-based parser) so that it
    offers `norm()` as well.
    '''

    def parse(self, date):
        '''Build a FlexiDate from the year, month and day of `date`.'''
        return FlexiDate(date.year, date.month, date.day)
+
# dateutil is optional: without it DateutilDateParser.parse returns None.
try:
    import dateutil.parser
    dateutil_parser = dateutil.parser.parser()
except ImportError:
    dateutil_parser = None


class DateutilDateParser(DateParserBase):
    '''Parse free-form date strings into FlexiDate using python-dateutil,
    with extra handling for BC/AD markers, 'circa', a leading 'p', a
    trailing-'?' uncertainty marker and small (1-2 digit) years.
    '''
    _numeric = re.compile("^[0-9]+$")
    # 'c.1950' or 'c1950', optionally preceded by non-letters
    _circa = re.compile(r'([^a-zA-Z]*)c\.?\s*(\d+.*)')
    # 'p1980'
    _p_prefix = re.compile(r'^p(\d+)')
    # four year characters followed by '?', e.g. '1985?'
    _uncertain = re.compile(r'([0-9xX]{4})\?')

    def parse(self, date, **kwargs):
        '''Parse the string `date`.

        :param date: the text to parse.
        :param **kwargs: any kwargs accepted by dateutil.parse function.
        :return: a FlexiDate, or None if dateutil is not installed or could
            not parse the text. When 'circa' or uncertainty markers were
            removed the qualifier is '<notes> : <original text>'.
        '''
        if dateutil_parser is None:
            return None
        qualifiers = []
        date = orig_date = date.strip()

        # various normalizations
        # TODO: call .lower() first
        date = date.replace('B.C.', 'BC')
        date = date.replace('A.D.', 'AD')

        # deal with pre 0AD dates ('B.C.' was already rewritten to 'BC')
        pre0AD = date.startswith('-') or 'BC' in date
        # BC seems to mess up parser
        date = date.replace('BC', '')

        # deal with circa: 'c.1950' or 'c1950'
        circa_match = self._circa.match(date)
        if circa_match:
            # remove circa bit
            qualifiers.append("Note 'circa'")
            date = ''.join(circa_match.groups())

        # deal with p1980 (what does this mean? it can appear in
        # field 008 of MARC records)
        if self._p_prefix.match(date):
            date = date[1:]

        # Deal with uncertainty: '1985?'
        if self._uncertain.match(date):
            # Remove the '?' itself (it is the fifth character). Chopping
            # the last character instead would leave the '?' in place
            # whenever something follows it, e.g. the space left behind in
            # '1985? BC' once 'BC' has been stripped.
            date = date[:4] + date[5:]
            qualifiers.append('Uncertainty')

        # Parse the numbers intelligently
        # do not use std parser function as creates lots of default data
        res = dateutil_parser._parse(date, **kwargs)

        if res is None:
            # Couldn't parse it
            return None
        # Note: Years of less than 3 digits not interpreted by
        # dateutil correctly
        #   e.g. 87 -> 1987
        #         4 -> day 4 (no year)
        # Both cases are handled in this routine
        day = res.day
        if res.year is None and res.day:
            # the lone number was really the year, so it must not be
            # reported as the day as well
            year = res.day
            day = None
        # If the whole date is simply two digits then dateutil_parser makes
        # it '86' -> '1986'. So strip off the '19'. (If the date specified
        # day/month then a two digit year is more likely to be this century
        # and so allow the '19' prefix to it.)
        elif (res.year is not None and self._numeric.match(date)
                and (len(date) == 2 or date.startswith('00'))):
            year = res.year % 100
        else:
            year = res.year

        # finally add back in BC stuff (nothing to negate without a year)
        if pre0AD and year is not None:
            year = -year

        if not qualifiers:
            qualifier = ''
        else:
            qualifier = ', '.join(qualifiers) + (' : %s' % orig_date)
        return FlexiDate(year, res.month, day, qualifier=qualifier)
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datautil/tests/__init__.py Tue Sep 27 19:46:22 2011 +0100
@@ -0,0 +1,1 @@
+__version__ = '0.4'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datautil/tests/test_date.py Tue Sep 27 19:46:22 2011 +0100
@@ -0,0 +1,207 @@
+from datautil.date import *
+
+import datetime
+
class TestPythonStringOrdering(object):
    '''Pin down how Python orders the strings used to represent dates.

    It is impossible to find a string format such that +ve and -ve numbers
    sort correctly as strings:
      if (in string ordering) X < Y => -X < -Y (False!)
    '''

    def test_ordering(self):
        ascending_pairs = [
            ('0', '1'),
            ('-10', '10'),
            ('-', '@'),
            ('-', '0'),
            ('-100', '-X10'),
            ('10', '1000'),
            ('02000', '10000'),
            (' 2000', '10000'),
        ]
        for smaller, larger in ascending_pairs:
            assert smaller < larger

    def test_bad_ordering(self):
        # pairs that do compare as "less than" ...
        for smaller, larger in [(' ', '0'), (' ', '-'), ('10000', '2000')]:
            assert smaller < larger
        assert '-100' > '-10'
        # ... and pairs that do not
        not_ascending_pairs = [
            ('-', '+'),
            ('-100', '-010'),
            ('-100', '- 10'),
            ('-100', ' -10'),
            ('-10', ' 1'),
        ]
        for left, right in not_ascending_pairs:
            assert not left < right
+
+
class TestFlexiDate(object):
    '''Checks for FlexiDate construction, string forms and conversions.'''

    def test_init(self):
        empty = FlexiDate()
        assert empty.year == '', empty
        assert empty.month == '', empty

        new_year = FlexiDate(2000, 1, 1)
        assert new_year.month == '01', new_year
        assert new_year.day == '01', new_year

    def test_str(self):
        full = FlexiDate(2000, 1, 23)
        assert str(full) == '2000-01-23', '"%s"' % full

        negative = FlexiDate(-2000, 1, 23)
        assert str(negative) == '-2000-01-23'

        year_only = FlexiDate(2000)
        assert str(year_only) == '2000'

        qualified = FlexiDate(1760, qualifier='fl.')
        assert str(qualified) == '1760 [fl.]', qualified

        qualifier_only = FlexiDate(qualifier='anything')
        assert str(qualifier_only) == ' [anything]'

    def test_from_str(self):
        # str -> from_str -> str must give back the same text
        samples = [
            FlexiDate(2000, 1, 23),
            FlexiDate(1760, qualifier='fl.'),
            FlexiDate(-1760, 1, 3, qualifier='fl.'),
        ]
        for sample in samples:
            text = str(sample)
            assert str(FlexiDate.from_str(text)) == text

    def test_as_float(self):
        value = FlexiDate(2000).as_float()
        assert value == float(2000), value

        expected = 1760 + 1/12.0 + 2/365.0
        value = FlexiDate(1760, 1, 2).as_float()
        assert value == expected, value

        assert FlexiDate(-1000).as_float() == float(-1000)

    def test_as_datetime(self):
        converted = FlexiDate(2000).as_datetime()
        assert converted == datetime.datetime(2000, 1, 1), converted

        converted = FlexiDate(1760, 1, 2).as_datetime()
        assert converted == datetime.datetime(1760, 1, 2), converted
+
+
class TestDateParsers(object):
    '''Checks for PythonDateParser, DateutilDateParser and parse().'''

    def test_using_datetime(self):
        parser = PythonDateParser()

        d1 = datetime.date(2000, 1, 23)
        fd = parser.parse(d1)
        assert fd.year == '2000'

        # the time of day is not carried over
        d1 = datetime.datetime(2000, 1, 23)
        fd = parser.parse(d1)
        assert str(fd) == '2000-01-23', fd

    def test_using_dateutil(self):
        parser = DateutilDateParser()
        cases = [
            ('2001-02', '2001-02'),
            ('March 1762', '1762-03'),
            ('1768 AD', '1768'),
            ('1768 A.D.', '1768'),
            ('-1850', '-1850'),
            ('1762 BC', '-1762'),
            ('4 BC', '-0004'),
            ('4 B.C.', '-0004'),
            ('Wed, 06 Jan 2010 09:30:00 GMT', '2010-01-06'),
            ('Tue, 07 Dec 2010 10:00:00 GMT', '2010-12-07'),
        ]
        for in1, expected in cases:
            fd = parser.parse(in1)
            assert str(fd) == expected, fd

    def test_parse(self):
        d1 = datetime.datetime(2000, 1, 23)
        fd = parse(d1)
        assert fd.year == '2000'

        fd = parse('March 1762')
        assert str(fd) == '1762-03'

        fd = parse(1966)
        assert str(fd) == '1966'

        fd = parse('22/07/2010')
        assert fd.month == '07', fd.month

    def test_parse_ambiguous_day_month(self):
        # parse() defaults to dayfirst=True
        fd = parse('05/07/2010')
        assert fd.month == '07', fd.month
        assert fd.day == '05', fd.day

    def test_parse_with_none(self):
        d1 = parse(None)
        assert d1 is None

    def test_parse_wildcards(self):
        fd = parse('198?')
        assert fd.year == '', fd.year  # expect this to not parse
        # TODO but we should have a float if possible

    def test_parse_with_qualifiers(self):
        fd = parse('1985?')
        assert fd.year == u'1985', fd
        assert fd.qualifier == u'Uncertainty : 1985?', fd.qualifier

        fd = parse('c.1780')
        assert fd.year == u'1780', fd
        assert fd.qualifier == u"Note 'circa' : c.1780", fd

        fd = parse('c. 1780')
        assert fd.year == u'1780', fd
        assert fd.qualifier.startswith(u"Note 'circa'"), fd

    def test_ambiguous(self):
        # TODO: have to be careful here ...
        # Smoke test only: checks that parsing does not raise; the expected
        # result for a year range has not been decided.
        parse('1068/1069')

    def test_small_years(self):
        in1 = '23'
        fd = parse(in1)
        assert str(fd) == '0023', fd
        assert fd.as_float() == 23, fd.as_float()

    def test_small_years_with_zeros(self):
        in1 = '0023'
        fd = parse(in1)
        assert str(fd) == '0023', fd
        assert fd.as_float() == 23, fd.as_float()

    def test_years_with_alpha_prefix(self):
        in1 = "p1980"
        fd = parse(in1)
        assert str(fd) == "1980", fd
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/setup.py Tue Sep 27 19:46:22 2011 +0100
@@ -0,0 +1,36 @@
# Packaging script for datautil-date.
from setuptools import setup, find_packages

import sys
# make the local ``datautil`` package importable so its version can be read
sys.path.insert(0, '.')
from datautil import __version__, __doc__ as __long_description__

setup(
    name='datautil-date',
    version=__version__,
    license='MIT',
    description='Date Utilities for Data Work',
    # datautil/__init__.py may have no docstring, in which case __doc__ is
    # None; fall back to an empty description.
    long_description=__long_description__ or '',
    author='Open Knowledge Foundation',
    author_email='info@okfn.org',
    url='http://okfn.org/projects/datautil/',
    download_url='http://bitbucket.org/okfn/datautil-date/',
    install_requires=[
        # python-dateutil 2.0 has a different _parse method, so stay on the
        # 1.x series
        'python-dateutil>=1.0,<1.99',
        # (optional) for excel handling
        # xlrd
        # (optional) for google docs handling
        # gdata
    ],
    packages=find_packages(),
    include_package_data=True,
    zip_safe=False,
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Environment :: Console',
        'Intended Audience :: Developers',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
)
Repository URL: https://bitbucket.org/okfn/datautil-date/
--
This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.
More information about the ckan-changes
mailing list