[ckan-changes] commit/ckanextiati: 3 new changesets

Bitbucket commits-noreply at bitbucket.org
Wed Nov 2 14:58:39 UTC 2011


3 new commits in ckanextiati:


https://bitbucket.org/okfn/ckanextiati/changeset/600314bc9e6f/
changeset:   600314bc9e6f
branch:      resource-archiver
user:        amercader
date:        2011-11-01 13:54:57
summary:     [schema] Remove isodate validators in db_to_form schema

The archiver adds some date fields to the resource table. If you use a
custom db_to_form schema, an Exception is thrown because they cannot be
converted to JSON.
affected #:  1 file

diff -r 8a6bb9a2a337b3515786cca5cce445d24d1e87e2 -r 600314bc9e6f507f07a3810eddfc78b89cf67b25 ckanext/iati/controllers/package_iati.py
--- a/ckanext/iati/controllers/package_iati.py
+++ b/ckanext/iati/controllers/package_iati.py
@@ -66,6 +66,12 @@
             'verified': [convert_from_extras,ignore_missing],
             'language': [convert_from_extras, ignore_missing],
         })
+        # Remove isodate validator
+        schema['resources'].update({
+            'last_modified': [ignore_missing],
+            'cache_last_updated': [ignore_missing],
+            'webstore_last_updated': [ignore_missing]
+        })
 
         return schema
 



https://bitbucket.org/okfn/ckanextiati/changeset/4e14f4683381/
changeset:   4e14f4683381
branch:      resource-archiver
user:        amercader
date:        2011-11-02 14:52:29
summary:     First version of the archiver command

A new paster command that will get all packages, download their resources,
parse the XML files and extract activity_count and data_updated.
Still need to work out a way of logging and providing a summary of the
process.
affected #:  2 files

diff -r 600314bc9e6f507f07a3810eddfc78b89cf67b25 -r 4e14f46833811f3d02efdc551450f37d376b4910 ckanext/iati/commands.py
--- /dev/null
+++ b/ckanext/iati/commands.py
@@ -0,0 +1,193 @@
+import os
+from lxml import etree
+import requests
+import json
+from pylons import config
+from ckan.lib.cli import CkanCommand
+from ckan.logic import get_action
+from ckan import model
+
+from ckan.lib.helpers import date_str_to_datetime
+from ckanext.archiver import tasks
+import logging
+
+log = logging.getLogger(__name__)
+
+class Archiver(CkanCommand):
+    '''
+    Download and save copies of all IATI activity files, extract some metrics
+    from them and store them as extras.
+
+    Usage:
+
+        paster iati-archiver update [{package-id}]
+           - Archive all activity files or just those belonging to a specific package
+             if a package id is provided
+
+    '''
+    summary = __doc__.split('\n')[0]
+    usage = __doc__
+    min_args = 0
+    max_args = 2
+    pkg_names = []
+
+    def command(self):
+        '''
+        Parse command line arguments and call appropriate method.
+        '''
+        if not self.args or self.args[0] in ['--help', '-h', 'help']:
+            print Archiver.__doc__
+            return
+
+        cmd = self.args[0]
+        self._load_config()
+        # TODO: use this when it gets to default ckan
+        # username = get_action('get_site_user')({'model': model, 'ignore_auth': True}, {})
+        context = {
+            'model': model,
+            'session':model.Session,
+            'site_url':config.get('ckan.site_url'),
+            'user': config.get('iati.admin_user.name'),
+            'apikey': config.get('iati.admin_user.api_key')
+        }
+
+        if cmd == 'update':
+            if len(self.args) > 1:
+                packages = [unicode(self.args[1])]
+
+            else:
+                packages = get_action('package_list')(context, {})
+
+            data_formats = tasks.DATA_FORMATS
+            data_formats.append('iati-xml')
+
+            log.info('Number of datasets to archive: %d' % len(packages))
+            updated = 0
+            for package_id in packages:
+                package = get_action('package_show_rest')(context,{'id': package_id})
+
+
+                is_activity_package = (package['extras']['filetype'] == 'activity') if 'filetype' in package['extras'] else 'activity'
+
+                log.info('Archiving dataset: %s (%d resources)' % (package.get('name'), len(package.get('resources', []))))
+                for resource in package.get('resources', []):
+
+                    if not resource.get('url',''):
+                        log.error('Resource for dataset %s does not have URL' % package['name'])
+                        continue
+
+                    try:
+                        result = tasks.download(context,resource,data_formats=data_formats)
+                    except tasks.LinkCheckerError,e:
+                        if 'method not allowed' in str(e).lower():
+                            # The DFID server does not support HEAD requests*,
+                            # so we need to handle the download manually
+                            # * But only the first time a file is downloaded!?
+                            result = _download_resource(context,resource,data_formats=data_formats)
+                        else:
+                            log.error('Invalid resource URL: %s' % str(e))
+                            continue
+                    except tasks.DownloadError:
+                        log.error('Error downloading resource: %s' % str(e))
+                        continue
+
+                    if 'zip' in result['headers']['content-type']:
+                        # Skip zipped files for now
+                        log.info('Skipping zipped file for dataset %s ' % package.get('name'))
+                        continue
+
+                    file_path = result['saved_file']
+                    f = open(file_path,'r')
+                    xml = f.read()
+                    f.close()
+                    os.remove(file_path)
+                    try:
+                        tree = etree.fromstring(xml)
+                    except etree.XMLSyntaxError,e:
+                        log.error('Could not parse XML file for dataset %s: %s' % (package['name'],str(e)))
+                        continue
+
+                    new_extras = {}
+                    if is_activity_package:
+                        # Number of activities (activity_count extra)
+                        new_extras['activity_count'] = int(tree.xpath('count(//iati-activity)'))
+
+                    # Last updated date (data_updated extra)
+                    if is_activity_package:
+                        xpath = 'iati-activity/@last-updated-datetime'
+                    else:
+                        xpath = 'iati-organisation/@last-updated-datetime'
+                    dates = tree.xpath(xpath) or []
+
+                    sorted(dates,reverse=True)
+                    last_updated_date = dates[0] if len(dates) else None
+
+                    # Check dates
+                    if last_updated_date:
+                        # Get rid of the microseconds
+                        if '.' in last_updated_date:
+                            last_updated_date = last_updated_date[:last_updated_date.find('.')]
+                        try:
+                            date = date_str_to_datetime(last_updated_date)
+                            format = '%Y-%m-%d %H:%M' if (date.hour and date.minute) else '%Y-%m-%d'
+                            new_extras['data_updated'] = date.strftime(format)
+                        except (ValueError,TypeError),e:
+                            log.error('Wrong date format for data_updated: %s' % str(e))
+
+
+                    update = False
+                    for key,value in new_extras.iteritems():
+                        if value and (not key in package['extras'] or value != package['extras'][key]):
+                            update = True
+                            old_value = package['extras'][key] if key in package['extras'] else '""'
+                            log.info('Updated extra %s for dataset %s: %s -> %s' % (key,package['name'],old_value,value))
+                            package['extras'][key] = value
+
+                    if update:
+                        context['id'] = package['id']
+                        updated_package = get_action('package_update_rest')(context,package)
+                        log.info('Package %s updated with new extras' % package['name'])
+                        updated = updated + 1
+            log.info('Done. Updated %i packages' % updated)
+        else:
+            log.error('Command %s not recognized' % (cmd,))
+
+def _download_resource(context,resource, max_content_length=50000000, url_timeout=30,data_formats=['xml','iati-xml']):
+
+    # get the resource and archive it
+    #logger.info("Resource identified as data file, attempting to archive")
+    res = requests.get(resource['url'], timeout = url_timeout)
+
+    headers = res.headers
+    resource_format = resource['format'].lower()
+    ct = headers.get('content-type', '').lower()
+    cl = headers.get('content-length')
+
+    resource_changed = (resource.get('mimetype') != ct) or (resource.get('size') != cl)
+    if resource_changed:
+        resource['mimetype'] = ct
+        resource['size'] = cl
+
+    length, hash, saved_file = tasks._save_resource(resource, res, max_content_length)
+
+    # check that resource did not exceed maximum size when being saved
+    # (content-length header could have been invalid/corrupted, or not accurate
+    # if resource was streamed)
+    #
+    # TODO: remove partially archived file in this case
+    if length >= max_content_length:
+        if resource_changed:
+            tasks._update_resource(context, resource)
+        # record fact that resource is too large to archive
+        raise tasks.DownloadError("Content-length after streaming reached maximum allowed value of %s" %
+            max_content_length)
+
+    # update the resource metadata in CKAN
+    resource['hash'] = hash
+    tasks._update_resource(context, resource)
+
+    return {'length': length,
+            'hash' :hash,
+            'headers': headers,
+            'saved_file': saved_file}
+


diff -r 600314bc9e6f507f07a3810eddfc78b89cf67b25 -r 4e14f46833811f3d02efdc551450f37d376b4910 setup.py
--- a/setup.py
+++ b/setup.py
@@ -35,6 +35,6 @@
       
       [paste.paster_command]
       create-iati-fixtures = ckanext.iati.fixtures:CreateIatiFixtures
-      
+      iati-archiver=ckanext.iati.commands:Archiver
       """,
       )



https://bitbucket.org/okfn/ckanextiati/changeset/e6fd499ac9c3/
changeset:   e6fd499ac9c3
branch:      resource-archiver
user:        amercader
date:        2011-11-02 15:58:08
summary:     Add time spent to the log
affected #:  1 file

diff -r 4e14f46833811f3d02efdc551450f37d376b4910 -r e6fd499ac9c36c35867b0cc706bd6136fa729de7 ckanext/iati/commands.py
--- a/ckanext/iati/commands.py
+++ b/ckanext/iati/commands.py
@@ -1,4 +1,5 @@
 import os
+import datetime
 from lxml import etree
 import requests
 import json
@@ -13,6 +14,7 @@
 
 log = logging.getLogger(__name__)
 
+
 class Archiver(CkanCommand):
     '''
     Download and save copies of all IATI activity files, extract some metrics
@@ -39,6 +41,8 @@
             print Archiver.__doc__
             return
 
+        t1 = datetime.datetime.now()
+
         cmd = self.args[0]
         self._load_config()
         # TODO: use this when it gets to default ckan
@@ -148,7 +152,10 @@
                         updated_package = get_action('package_update_rest')(context,package)
                         log.info('Package %s updated with new extras' % package['name'])
                         updated = updated + 1
-            log.info('Done. Updated %i packages' % updated)
+
+            t2 = datetime.datetime.now()
+
+            log.info('Done. Updated %i packages. Total time: %s' % (updated,str(t2 - t1)))
         else:
             log.error('Command %s not recognized' % (cmd,))

Repository URL: https://bitbucket.org/okfn/ckanextiati/

--

This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.




More information about the ckan-changes mailing list