[ckan-changes] commit/ckanextiati: 3 new changesets
Bitbucket
commits-noreply at bitbucket.org
Wed Nov 2 14:58:39 UTC 2011
3 new commits in ckanextiati:
https://bitbucket.org/okfn/ckanextiati/changeset/600314bc9e6f/
changeset: 600314bc9e6f
branch: resource-archiver
user: amercader
date: 2011-11-01 13:54:57
summary: [schema] Remove isodate validators in db_to_form schema
The archiver adds some date fields to the resource table. If you use a
custom db_to_form schema, an Exception is thrown because they can not be
converted to JSON.
affected #: 1 file
diff -r 8a6bb9a2a337b3515786cca5cce445d24d1e87e2 -r 600314bc9e6f507f07a3810eddfc78b89cf67b25 ckanext/iati/controllers/package_iati.py
--- a/ckanext/iati/controllers/package_iati.py
+++ b/ckanext/iati/controllers/package_iati.py
@@ -66,6 +66,12 @@
'verified': [convert_from_extras,ignore_missing],
'language': [convert_from_extras, ignore_missing],
})
+ # Remove isodate validator
+ schema['resources'].update({
+ 'last_modified': [ignore_missing],
+ 'cache_last_updated': [ignore_missing],
+ 'webstore_last_updated': [ignore_missing]
+ })
return schema
https://bitbucket.org/okfn/ckanextiati/changeset/4e14f4683381/
changeset: 4e14f4683381
branch: resource-archiver
user: amercader
date: 2011-11-02 14:52:29
summary: First version of the archiver command
A new paster command that will get all packages, download their resources,
parse the XML files and extract activity_count and data_updated.
Still need to work out a way of logging and providing a summary of the
process.
affected #: 2 files
diff -r 600314bc9e6f507f07a3810eddfc78b89cf67b25 -r 4e14f46833811f3d02efdc551450f37d376b4910 ckanext/iati/commands.py
--- /dev/null
+++ b/ckanext/iati/commands.py
@@ -0,0 +1,193 @@
+import os
+from lxml import etree
+import requests
+import json
+from pylons import config
+from ckan.lib.cli import CkanCommand
+from ckan.logic import get_action
+from ckan import model
+
+from ckan.lib.helpers import date_str_to_datetime
+from ckanext.archiver import tasks
+import logging
+
+log = logging.getLogger(__name__)
+
+class Archiver(CkanCommand):
+ '''
+ Download and save copies of all IATI activity files, extract some metrics
+ from them and store them as extras.
+
+ Usage:
+
+ paster iati-archiver update [{package-id}]
+ - Archive all activity files or just those belonging to a specific package
+ if a package id is provided
+
+ '''
+ summary = __doc__.split('\n')[0]
+ usage = __doc__
+ min_args = 0
+ max_args = 2
+ pkg_names = []
+
+ def command(self):
+ '''
+ Parse command line arguments and call appropriate method.
+ '''
+ if not self.args or self.args[0] in ['--help', '-h', 'help']:
+ print Archiver.__doc__
+ return
+
+ cmd = self.args[0]
+ self._load_config()
+ # TODO: use this when it gets to default ckan
+ # username = get_action('get_site_user')({'model': model, 'ignore_auth': True}, {})
+ context = {
+ 'model': model,
+ 'session':model.Session,
+ 'site_url':config.get('ckan.site_url'),
+ 'user': config.get('iati.admin_user.name'),
+ 'apikey': config.get('iati.admin_user.api_key')
+ }
+
+ if cmd == 'update':
+ if len(self.args) > 1:
+ packages = [unicode(self.args[1])]
+
+ else:
+ packages = get_action('package_list')(context, {})
+
+ data_formats = tasks.DATA_FORMATS
+ data_formats.append('iati-xml')
+
+ log.info('Number of datasets to archive: %d' % len(packages))
+ updated = 0
+ for package_id in packages:
+ package = get_action('package_show_rest')(context,{'id': package_id})
+
+
+ is_activity_package = (package['extras']['filetype'] == 'activity') if 'filetype' in package['extras'] else 'activity'
+
+ log.info('Archiving dataset: %s (%d resources)' % (package.get('name'), len(package.get('resources', []))))
+ for resource in package.get('resources', []):
+
+ if not resource.get('url',''):
+ log.error('Resource for dataset %s does not have URL' % package['name'])
+ continue
+
+ try:
+ result = tasks.download(context,resource,data_formats=data_formats)
+ except tasks.LinkCheckerError,e:
+ if 'method not allowed' in str(e).lower():
+ # The DFID server does not support HEAD requests*,
+ # so we need to handle the download manually
+ # * But only the first time a file is downloaded!?
+ result = _download_resource(context,resource,data_formats=data_formats)
+ else:
+ log.error('Invalid resource URL: %s' % str(e))
+ continue
+ except tasks.DownloadError:
+ log.error('Error downloading resource: %s' % str(e))
+ continue
+
+ if 'zip' in result['headers']['content-type']:
+ # Skip zipped files for now
+ log.info('Skipping zipped file for dataset %s ' % package.get('name'))
+ continue
+
+ file_path = result['saved_file']
+ f = open(file_path,'r')
+ xml = f.read()
+ f.close()
+ os.remove(file_path)
+ try:
+ tree = etree.fromstring(xml)
+ except etree.XMLSyntaxError,e:
+ log.error('Could not parse XML file for dataset %s: %s' % (package['name'],str(e)))
+ continue
+
+ new_extras = {}
+ if is_activity_package:
+ # Number of activities (activity_count extra)
+ new_extras['activity_count'] = int(tree.xpath('count(//iati-activity)'))
+
+ # Last updated date (data_updated extra)
+ if is_activity_package:
+ xpath = 'iati-activity/@last-updated-datetime'
+ else:
+ xpath = 'iati-organisation/@last-updated-datetime'
+ dates = tree.xpath(xpath) or []
+
+ sorted(dates,reverse=True)
+ last_updated_date = dates[0] if len(dates) else None
+
+ # Check dates
+ if last_updated_date:
+ # Get rid of the microseconds
+ if '.' in last_updated_date:
+ last_updated_date = last_updated_date[:last_updated_date.find('.')]
+ try:
+ date = date_str_to_datetime(last_updated_date)
+ format = '%Y-%m-%d %H:%M' if (date.hour and date.minute) else '%Y-%m-%d'
+ new_extras['data_updated'] = date.strftime(format)
+ except (ValueError,TypeError),e:
+ log.error('Wrong date format for data_updated: %s' % str(e))
+
+
+ update = False
+ for key,value in new_extras.iteritems():
+ if value and (not key in package['extras'] or value != package['extras'][key]):
+ update = True
+ old_value = package['extras'][key] if key in package['extras'] else '""'
+ log.info('Updated extra %s for dataset %s: %s -> %s' % (key,package['name'],old_value,value))
+ package['extras'][key] = value
+
+ if update:
+ context['id'] = package['id']
+ updated_package = get_action('package_update_rest')(context,package)
+ log.info('Package %s updated with new extras' % package['name'])
+ updated = updated + 1
+ log.info('Done. Updated %i packages' % updated)
+ else:
+ log.error('Command %s not recognized' % (cmd,))
+
+def _download_resource(context,resource, max_content_length=50000000, url_timeout=30,data_formats=['xml','iati-xml']):
+
+ # get the resource and archive it
+ #logger.info("Resource identified as data file, attempting to archive")
+ res = requests.get(resource['url'], timeout = url_timeout)
+
+ headers = res.headers
+ resource_format = resource['format'].lower()
+ ct = headers.get('content-type', '').lower()
+ cl = headers.get('content-length')
+
+ resource_changed = (resource.get('mimetype') != ct) or (resource.get('size') != cl)
+ if resource_changed:
+ resource['mimetype'] = ct
+ resource['size'] = cl
+
+ length, hash, saved_file = tasks._save_resource(resource, res, max_content_length)
+
+ # check that resource did not exceed maximum size when being saved
+ # (content-length header could have been invalid/corrupted, or not accurate
+ # if resource was streamed)
+ #
+ # TODO: remove partially archived file in this case
+ if length >= max_content_length:
+ if resource_changed:
+ tasks._update_resource(context, resource)
+ # record fact that resource is too large to archive
+ raise tasks.DownloadError("Content-length after streaming reached maximum allowed value of %s" %
+ max_content_length)
+
+ # update the resource metadata in CKAN
+ resource['hash'] = hash
+ tasks._update_resource(context, resource)
+
+ return {'length': length,
+ 'hash' :hash,
+ 'headers': headers,
+ 'saved_file': saved_file}
+
diff -r 600314bc9e6f507f07a3810eddfc78b89cf67b25 -r 4e14f46833811f3d02efdc551450f37d376b4910 setup.py
--- a/setup.py
+++ b/setup.py
@@ -35,6 +35,6 @@
[paste.paster_command]
create-iati-fixtures = ckanext.iati.fixtures:CreateIatiFixtures
-
+ iati-archiver=ckanext.iati.commands:Archiver
""",
)
https://bitbucket.org/okfn/ckanextiati/changeset/e6fd499ac9c3/
changeset: e6fd499ac9c3
branch: resource-archiver
user: amercader
date: 2011-11-02 15:58:08
summary: Add time spent to the log
affected #: 1 file
diff -r 4e14f46833811f3d02efdc551450f37d376b4910 -r e6fd499ac9c36c35867b0cc706bd6136fa729de7 ckanext/iati/commands.py
--- a/ckanext/iati/commands.py
+++ b/ckanext/iati/commands.py
@@ -1,4 +1,5 @@
import os
+import datetime
from lxml import etree
import requests
import json
@@ -13,6 +14,7 @@
log = logging.getLogger(__name__)
+
class Archiver(CkanCommand):
'''
Download and save copies of all IATI activity files, extract some metrics
@@ -39,6 +41,8 @@
print Archiver.__doc__
return
+ t1 = datetime.datetime.now()
+
cmd = self.args[0]
self._load_config()
# TODO: use this when it gets to default ckan
@@ -148,7 +152,10 @@
updated_package = get_action('package_update_rest')(context,package)
log.info('Package %s updated with new extras' % package['name'])
updated = updated + 1
- log.info('Done. Updated %i packages' % updated)
+
+ t2 = datetime.datetime.now()
+
+ log.info('Done. Updated %i packages. Total time: %s' % (updated,str(t2 - t1)))
else:
log.error('Command %s not recognized' % (cmd,))
Repository URL: https://bitbucket.org/okfn/ckanextiati/
--
This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.
More information about the ckan-changes
mailing list