[ckan-changes] commit/datautil-date: dread: Copied just the date stuff from github datautil, for ckan use.

Bitbucket commits-noreply at bitbucket.org
Tue Sep 27 18:48:34 UTC 2011


1 new changeset in datautil-date:

http://bitbucket.org/okfn/datautil-date/changeset/2dc27829de20/
changeset:   2dc27829de20
user:        dread
date:        2011-09-27 20:46:22
summary:     Copied just the date stuff from github datautil, for ckan use.
affected #:  5 files (-1 bytes)

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datautil/__init__.py	Tue Sep 27 19:46:22 2011 +0100
@@ -0,0 +1,1 @@
+__version__ = '0.4'


--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datautil/date.py	Tue Sep 27 19:46:22 2011 +0100
@@ -0,0 +1,282 @@
+'''Date parsing and normalization utilities based on FlexiDate.
+
+To parse dates use parse, e.g.::
+
+    parse('1890') -> FlexiDate(year=u'1890')
+    parse('1890?') -> FlexiDate(year=u'1890', qualifier='Uncertainty : 1890?')
+
+Once you have a FlexiDate you can get access to attributes (strings of course
+...)::
+
+    fd = parse('Jan 1890')
+    fd.year # u'1890'
+    fd.month # u'01'
+
+And convert to other forms::
+
+    fd.as_float() # 1890.0833... (year + month/12)
+    fd.as_datetime() # datetime(1890, 1, 1)
+
+Background
+==========
+
+FlexiDate is focused on supporting:
+
+  1. Dates outside of Python (or DB) supported period (esp. dates < 0 AD)
+  2. Imprecise dates (c.1860, 18??, fl. 1534, etc)
+  3. Normalization of dates to machine processable versions
+  4. Sortable in the database (in correct date order)
+
+For more information see:
+
+http://www.rufuspollock.org/2009/06/18/flexible-dates-in-python/
+'''
+import re
+import datetime
+
+class FlexiDate(object):
+    """Store dates as strings and present them in a slightly extended version
+    of ISO8601.
+
+    Modifications:
+        * Allow a trailing qualifier in square brackets, e.g. [fl.]
+        * Allow replacement of unknown values by ? e.g. if sometime in 1800s
+          can do 18??
+    
+    Restriction on ISO8601:
+        * Truncation (e.g. of centuries) is *not* permitted.
+        * No week and day representation e.g. 1999-W01
+    """
+    # pass
+    def __init__(self, year=None, month=None, day=None, qualifier=''):
+        # force = month or day or qualifier
+        force = False
+        self.year = self._cvt(year, rjust=4, force=force)
+        self.month = self._cvt(month)
+        self.day = self._cvt(day)
+        self.qualifier = qualifier
+         
+    def _cvt(self, val, rjust=2, force=False):
+        if val:
+            tmp = unicode(val).strip()
+            if tmp.startswith('-'):
+                tmp = '-' + tmp[1:].rjust(rjust, '0')
+            else:
+                tmp = tmp.rjust(rjust, '0')
+            return tmp
+        elif force:
+            # use '!' rather than '?' as '!' < '1' while '?' > '1'
+            return rjust * '!'
+        else:
+            return ''
+
+    def __str__(self):
+        out = self.isoformat()
+        if self.qualifier:
+            # the leading space is important: when there is no year it makes
+            # the value sort in the right order, since ' ' < '1'
+            out += u' [%s]' % self.qualifier
+        return out
+
+    def __repr__(self):
+        return u'%s %s' % (self.__class__, self.__str__())
+
+    def isoformat(self, strict=False):
+        '''Return date in isoformat (same as __str__ but without qualifier).
+        
+        WARNING: does not replace '?' in dates unless strict=True.
+        '''
+        out = self.year
+        # what do we do when no year ...
+        for val in [ self.month, self.day ]:
+            if not val:
+                break
+            out += u'-' + val
+        if strict:
+            out = out.replace('?', '0')
+        return out
+
+    our_re_pat = '''
+        (?P<year> -?[\d?]+)
+        (?:
+                \s* - (?P<month> [\d?]{1,2})
+            (?: \s* - (?P<day> [\d?]{1,2}) )?
+        )?
+        \s*
+        (?: \[ (?P<qualifier>[^]]*) \])?
+        '''
+    our_re = re.compile(our_re_pat, re.VERBOSE)
+    @classmethod
+    def from_str(self, instr):
+        '''Undo the effect of __str__ (parse its output back into a FlexiDate).'''
+        if not instr:
+            return FlexiDate()
+
+        out = self.our_re.match(instr)
+        if out is None: # no match TODO: raise Exception?
+            return None
+        else:
+            return FlexiDate(
+                    out.group('year'),
+                    out.group('month'),
+                    out.group('day'),
+                    qualifier=out.group('qualifier')
+                    )
+    
+    def as_float(self):
+        '''Get as a float (year being the integer part).
+
+        Replace '?' in year with 9 so as to be conservative (e.g. 19?? becomes
+        1999) and elsewhere (month, day) with 0
+
+        @return: float.
+        '''
+        if not self.year: return None
+        out = float(self.year.replace('?', '9'))
+        if self.month:
+            # TODO: we are assuming months are of equal length
+            out += float(self.month.replace('?', '0')) / 12.0
+            if self.day:
+                out += float(self.day.replace('?', '0')) / 365.0
+        return out
+
+    def as_datetime(self):
+        '''Get as python datetime.datetime.
+
+        Requires year to be a valid datetime year. Month and day default to 1
+        if they are not set.
+
+        @return: datetime.datetime object.
+        '''
+        year = int(self.year)
+        month = int(self.month) if self.month else 1
+        day = int(self.day) if self.day else 1
+        return datetime.datetime(year, month, day)
+
+
+def parse(date, dayfirst=True):
+    '''Parse a `date` into a `FlexiDate`.
+
+    @param date: the date to parse - may be a string, datetime.date,
+    datetime.datetime or FlexiDate.
+
+    TODO: support for quarters e.g. Q4 1980 or 1954 Q3
+    TODO: support latin stuff like M.DCC.LIII  
+    TODO: convert '-' to '?' when used that way
+        e.g. had this date [181-]
+    '''
+    if not date:
+        return None
+    if isinstance(date, FlexiDate):
+        return date
+    if isinstance(date, int):
+        return FlexiDate(year=date)
+    elif isinstance(date, datetime.date):
+        parser = PythonDateParser()
+        return parser.parse(date)
+    else: # assuming it's a string
+        parser = DateutilDateParser()
+        out = parser.parse(date, **{'dayfirst': dayfirst})
+        if out is not None:
+            return out
+        # msg = 'Unable to parse %s' % date
+        # raise ValueError(date)
+        val = 'UNPARSED: %s' % date
+        val = val.encode('ascii', 'ignore')
+        return FlexiDate(qualifier=val)
+
+
+class DateParserBase(object):
+    def parse(self, date):
+        raise NotImplementedError
+
+    def norm(self, date):
+        return str(self.parse(date))
+
+class PythonDateParser(object):
+    def parse(self, date):
+        return FlexiDate(date.year, date.month, date.day)
+
+try:
+    import dateutil.parser
+    dateutil_parser = dateutil.parser.parser()
+except:
+    dateutil_parser = None
+
+class DateutilDateParser(DateParserBase):
+    _numeric = re.compile("^[0-9]+$")
+    def parse(self, date, **kwargs):
+        '''
+        :param **kwargs: any kwargs accepted by dateutil.parse function.
+        '''
+        qualifiers = []
+        if dateutil_parser is None:
+            return None
+        date = orig_date = date.strip()
+
+        # various normalizations
+        # TODO: call .lower() first
+        date = date.replace('B.C.', 'BC')
+        date = date.replace('A.D.', 'AD')
+
+        # deal with pre 0AD dates
+        if date.startswith('-') or 'BC' in date or 'B.C.' in date:
+            pre0AD = True
+        else:
+            pre0AD = False
+        # BC seems to mess up parser
+        date = date.replace('BC', '')
+
+        # deal with circa: 'c.1950' or 'c1950'
+        circa_match = re.match('([^a-zA-Z]*)c\.?\s*(\d+.*)', date)
+        if circa_match:
+            # remove circa bit
+            qualifiers.append("Note 'circa'")
+            date = ''.join(circa_match.groups())
+
+        # deal with p1980 (what does this mean? it can appear in
+        # field 008 of MARC records)
+        p_match = re.match("^p(\d+)", date)
+        if p_match:
+            date = date[1:]
+
+        # Deal with uncertainty: '1985?'
+        uncertainty_match = re.match('([0-9xX]{4})\?', date)
+        if uncertainty_match:
+            # remove the ?
+            date = date[:-1]
+            qualifiers.append('Uncertainty')
+
+        # Parse the numbers intelligently
+        # do not use the standard parse function as it creates lots of default data
+        res = dateutil_parser._parse(date, **kwargs)
+
+        if res is None:
+            # Couldn't parse it
+            return None
+        #Note: Years of less than 3 digits not interpreted by
+        #      dateutil correctly
+        #      e.g. 87 -> 1987
+        #           4  -> day 4 (no year)
+        # Both cases are handled in this routine
+        if res.year is None and res.day:
+            year = res.day
+        # If the whole date is simply two digits then dateutil_parser makes
+        # it '86' -> '1986'. So strip off the '19'. (If the date specified
+        # day/month then a two digit year is more likely to be this century
+        # and so allow the '19' prefix to it.)
+        elif self._numeric.match(date) and (len(date) == 2 or date.startswith('00')):
+            year = res.year % 100
+        else:
+            year = res.year
+
+        # finally add back in BC stuff
+        if pre0AD:
+            year = -year
+            
+        if not qualifiers:
+            qualifier = ''
+        else:
+            qualifier = ', '.join(qualifiers) + (' : %s' % orig_date)
+        return FlexiDate(year, res.month, res.day, qualifier=qualifier)
+    


--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datautil/tests/__init__.py	Tue Sep 27 19:46:22 2011 +0100
@@ -0,0 +1,1 @@
+__version__ = '0.4'


--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datautil/tests/test_date.py	Tue Sep 27 19:46:22 2011 +0100
@@ -0,0 +1,207 @@
+from datautil.date import *
+
+import datetime
+
+class TestPythonStringOrdering(object):
+    # It is impossible to find a string format such that +ve and -ve numbers
+    # sort correctly as strings:
+    # if (in string ordering) X < Y => -X < -Y (False!)
+    def test_ordering(self):
+        assert '0' < '1'
+        assert '-10' < '10'
+        assert '-' < '@'
+        assert '-' < '0'
+        assert '-100' < '-X10'
+        assert '10' < '1000'
+        assert '02000' < '10000'
+        assert ' 2000' < '10000'
+
+    def test_bad_ordering(self):
+        assert ' ' < '0'
+        assert ' ' < '-'
+        assert not '-' < '+'
+        assert '-100' > '-10'
+        assert not '-100' < '-010'
+        assert not '-100' < '- 10'
+        assert not '-100' < ' -10'
+        assert '10000' < '2000'
+        assert not '-10' < ' 1'
+        
+
+class TestFlexiDate(object):
+    def test_init(self):
+        fd = FlexiDate()
+        assert fd.year == '', fd
+        assert fd.month == '', fd
+
+        fd = FlexiDate(2000, 1,1)
+        assert fd.month == '01', fd
+        assert fd.day== '01', fd
+
+    def test_str(self):
+        fd = FlexiDate(2000, 1, 23)
+        assert str(fd) == '2000-01-23', '"%s"' % fd
+        fd = FlexiDate(-2000, 1, 23)
+        assert str(fd) == '-2000-01-23'
+        fd = FlexiDate(2000)
+        assert str(fd) == '2000'
+        fd = FlexiDate(1760, qualifier='fl.')
+        assert str(fd) == '1760 [fl.]', fd
+
+        fd = FlexiDate(qualifier='anything')
+        assert str(fd) == ' [anything]'
+
+
+    def test_from_str(self):
+        def dotest(fd):
+            out = FlexiDate.from_str(str(fd))
+            assert str(out) == str(fd)
+
+        fd = FlexiDate(2000, 1, 23)
+        dotest(fd)
+        fd = FlexiDate(1760, qualifier='fl.')
+        dotest(fd)
+        fd = FlexiDate(-1760, 1, 3, qualifier='fl.')
+        dotest(fd)
+    
+    def test_as_float(self):
+        fd = FlexiDate(2000)
+        assert fd.as_float() == float(2000), fd.as_float()
+        fd = FlexiDate(1760, 1, 2)
+        exp = 1760 + 1/12.0 + 2/365.0
+        assert fd.as_float() == exp, fd.as_float()
+        fd = FlexiDate(-1000)
+        assert fd.as_float() == float(-1000)
+
+    def test_as_datetime(self):
+        fd = FlexiDate(2000)
+        out = fd.as_datetime()
+        assert out == datetime.datetime(2000, 1, 1), out
+        fd = FlexiDate(1760, 1, 2)
+        out = fd.as_datetime()
+        assert out == datetime.datetime(1760,1,2), out
+
+
+class TestDateParsers(object):
+    def test_using_datetime(self):
+        parser = PythonDateParser()
+
+        d1 = datetime.date(2000, 1, 23)
+        fd = parser.parse(d1)
+        assert fd.year == '2000'
+
+        d1 = datetime.datetime(2000, 1, 23)
+        fd = parser.parse(d1)
+        # assert str(fd) == '2000-01-23T00:00:00', fd
+        assert str(fd) == '2000-01-23', fd
+
+    def test_using_dateutil(self):
+        parser = DateutilDateParser()
+
+        in1 = '2001-02'
+        fd = parser.parse(in1)
+        assert str(fd) == in1, fd
+
+        in1 = 'March 1762'
+        fd = parser.parse(in1)
+        assert str(fd) == '1762-03'
+
+        in1 = 'March 1762'
+        fd = parser.parse(in1)
+        assert str(fd) == '1762-03'
+
+        in1 = '1768 AD'
+        fd = parser.parse(in1)
+        assert str(fd) == '1768', fd
+
+        in1 = '1768 A.D.'
+        fd = parser.parse(in1)
+        assert str(fd) == '1768', fd
+
+        in1 = '-1850'
+        fd = parser.parse(in1)
+        assert str(fd) == '-1850', fd
+
+        in1 = '1762 BC'
+        fd = parser.parse(in1)
+        assert str(fd) == '-1762', fd
+
+        in1 = '4 BC'
+        fd = parser.parse(in1)
+        assert str(fd) == '-0004', fd
+
+        in1 = '4 B.C.'
+        fd = parser.parse(in1)
+        assert str(fd) == '-0004', fd
+
+        in1 = 'Wed, 06 Jan 2010 09:30:00 GMT'
+        fd = parser.parse(in1)
+        assert str(fd) == '2010-01-06', fd
+
+        in1 = 'Tue, 07 Dec 2010 10:00:00 GMT'
+        fd = parser.parse(in1)
+        assert str(fd) == '2010-12-07', fd
+
+    def test_parse(self):
+        d1 = datetime.datetime(2000, 1, 23)
+        fd = parse(d1)
+        assert fd.year == '2000'
+
+        fd = parse('March 1762')
+        assert str(fd) == '1762-03'
+
+        fd = parse(1966)
+        assert str(fd) == '1966'
+
+        fd = parse('22/07/2010')
+        assert fd.month == '07', fd.month
+
+    def test_parse_ambiguous_day_month(self):
+        fd = parse('05/07/2010')
+        assert fd.month == '07', fd.month
+        assert fd.day == '05', fd.month
+
+    def test_parse_with_none(self):
+        d1 = parse(None)
+        assert d1 is None
+    
+    def test_parse_wildcards(self):
+        fd = parse('198?')
+        assert fd.year == '', fd.year # expect this to not parse
+        # TODO but we should have a float if possible
+#        assert fd.as_float() == u'1980', fd.as_float()
+
+    def test_parse_with_qualifiers(self):
+        fd = parse('1985?')
+        assert fd.year == u'1985', fd
+        assert fd.qualifier == u'Uncertainty : 1985?', fd.qualifier
+
+        fd = parse('c.1780')
+        assert fd.year == u'1780', fd
+        assert fd.qualifier == u"Note 'circa' : c.1780", fd
+
+        fd = parse('c. 1780')
+        assert fd.year == u'1780', fd
+        assert fd.qualifier.startswith(u"Note 'circa'"), fd
+
+    def test_ambiguous(self):
+        # TODO: have to be careful here ...
+        fd = parse('1068/1069')
+
+    def test_small_years(self):
+        in1 = '23'
+        fd = parse(in1)
+        assert str(fd) == '0023', fd
+        assert fd.as_float() == 23, fd.as_float()
+
+    def test_small_years_with_zeros(self):
+        in1 = '0023'
+        fd = parse(in1)
+        assert str(fd) == '0023', fd
+        assert fd.as_float() == 23, fd.as_float()
+
+    def test_years_with_alpha_prefix(self):
+        in1 = "p1980"
+        fd = parse(in1)
+        assert str(fd) == "1980", fd
+        


--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/setup.py	Tue Sep 27 19:46:22 2011 +0100
@@ -0,0 +1,36 @@
+from setuptools import setup, find_packages
+
+import sys
+sys.path.insert(0, '.')
+from datautil import __version__, __doc__ as __long_description__
+
+setup(
+    name='datautil-date',
+    version=__version__,
+    license='MIT',
+    description='Date Utilities for Data Work',
+    long_description=__long_description__,
+    author='Open Knowledge Foundation',
+    author_email='info@okfn.org',
+    url='http://okfn.org/projects/datautil/',
+    download_url='http://bitbucket.org/okfn/datautil-date/',
+    install_requires=[
+        # python-dateutil 2.0 has a different _parse method, so stick to 1.x
+        'python-dateutil>=1.0,<1.99',
+        # (optional) for excel handling
+        # xlrd
+        # (optional) for google docs handling
+        # gdata
+        ],
+    packages=find_packages(),
+    include_package_data=True,
+    zip_safe=False,
+    classifiers = [
+        'Development Status :: 5 - Production/Stable',
+        'Environment :: Console',
+        'Intended Audience :: Developers',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Topic :: Software Development :: Libraries :: Python Modules'
+    ],
+)

Repository URL: https://bitbucket.org/okfn/datautil-date/

--

This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.




More information about the ckan-changes mailing list