[ckan-changes] commit/ckanext-harvest: 3 new changesets
Bitbucket
commits-noreply at bitbucket.org
Tue May 10 15:07:11 UTC 2011
3 new changesets in ckanext-harvest:
http://bitbucket.org/okfn/ckanext-harvest/changeset/0b88a7fd298a/
changeset: r87:0b88a7fd298a
user: amercader
date: 2011-05-10 12:05:44
summary: Add a reference date to the Harvest Objects. This must be set during the harvest
process.
affected #: 2 files (236 bytes)
--- a/ckanext/harvest/interfaces.py Mon May 09 18:47:30 2011 +0100
+++ b/ckanext/harvest/interfaces.py Tue May 10 11:05:44 2011 +0100
@@ -24,7 +24,10 @@
- gathering all the necessary objects to fetch on a later.
stage (e.g. for a CSW server, perform a GetRecords request)
- creating the necessary HarvestObjects in the database, specifying
- the guid and a reference to its source and job.
+ the guid and a reference to its job. The HarvestObjects need a
+ reference date with the last modified date for the resource, this
+ may need to be set in a different stage depending on the type of
+ source.
- creating and storing any suitable HarvestGatherErrors that may
occur.
- returning a list with all the ids of the created HarvestObjects.
--- a/ckanext/harvest/model/__init__.py Mon May 09 18:47:30 2011 +0100
+++ b/ckanext/harvest/model/__init__.py Tue May 10 11:05:44 2011 +0100
@@ -119,6 +119,7 @@
harvest_object_table = Table('harvest_object', metadata,
Column('id', types.UnicodeText, primary_key=True, default=make_uuid),
Column('guid', types.UnicodeText, default=''),
+ Column('reference_date', DateTime),
Column('created', DateTime, default=datetime.datetime.utcnow),
Column('content', types.UnicodeText, nullable=True),
Column('harvest_job_id', types.UnicodeText, ForeignKey('harvest_job.id')),
http://bitbucket.org/okfn/ckanext-harvest/changeset/643775c2ca44/
changeset: r88:643775c2ca44
user: amercader
date: 2011-05-10 13:57:57
summary: Save reference date in Harvest Objects when harvesting CKAN instances
affected #: 1 file (70 bytes)
--- a/ckanext/harvest/harvesters.py Tue May 10 11:05:44 2011 +0100
+++ b/ckanext/harvest/harvesters.py Tue May 10 12:57:57 2011 +0100
@@ -1,5 +1,14 @@
import urllib2
+from ckan.logic.action.create import package_create_rest
+from ckan.logic.action.update import package_update_rest
+from ckan.logic.action.get import package_show
+from ckan.logic.schema import default_package_schema
+from ckan.logic import ValidationError,NotFound
+from ckan import model
+from ckan.model import Session
+from ckan.lib.navl.validators import ignore_missing
+
from ckan.lib.helpers import json
from ckan.plugins.core import SingletonPlugin, implements
@@ -134,19 +143,15 @@
self._save_object_error('Empty content for object %s' % harvest_object.id,harvest_object,'Import')
return False
try:
- from ckan.logic.action.create import package_create_rest
- from ckan.logic.action.update import package_update_rest
- from ckan.logic.action.get import package_show
- from ckan.logic.schema import default_package_schema
- from ckan.logic import ValidationError,NotFound
- from ckan import model
- from ckan.model import Session
- from ckan.lib.navl.validators import ignore_missing
# harvest_object.content is the result of an API call like
# http://ec2-46-51-149-132.eu-west-1.compute.amazonaws.com:8081/api/2/rest/package/77d93608-3a3e-42e5-baab-3521afb504f1
package_dict = json.loads(harvest_object.content)
+ # Save reference date in Harvest Object
+ harvest_object.reference_date = package_dict['metadata_modified']
+ harvest_object.save()
+
## change default schema
schema = default_package_schema()
schema["id"] = [ignore_missing, unicode]
http://bitbucket.org/okfn/ckanext-harvest/changeset/d6573f79b303/
changeset: r89:d6573f79b303
user: amercader
date: 2011-05-10 17:06:57
summary: Add command to reimport existing harvest objects
affected #: 2 files (1.9 KB)
--- a/ckanext/harvest/commands/harvester.py Tue May 10 12:57:57 2011 +0100
+++ b/ckanext/harvest/commands/harvester.py Tue May 10 16:06:57 2011 +0100
@@ -38,7 +38,12 @@
harvester fetch_consumer
- starts the consumer for the fetching queue
-
+
+ harvester import [{source-id}]
+ - perform the import stage with the last fetched objects, optionally belonging to a certain source.
+ Please note that no objects will be fetched from the remote server. It will only affect
+ the last fetched objects already present in the database.
+
The commands should be run from the ckanext-harvest directory and expect
a development.ini file to be present. Most of the time you will
specify the config explicitly though::
@@ -82,9 +87,10 @@
logging.getLogger('amqplib').setLevel(logging.INFO)
consumer = get_fetch_consumer()
consumer.wait()
- elif cmd == "initdb":
+ elif cmd == 'initdb':
self.initdb()
-
+ elif cmd == 'import':
+ self.import_stage()
else:
print 'Command %s not recognized' % cmd
@@ -185,9 +191,16 @@
jobs = run_harvest_jobs()
except:
pass
- sys.exit(1)
+ sys.exit(0)
#print 'Sent %s jobs to the gather queue' % len(jobs)
+ def import_stage(self):
+ if len(self.args) >= 2:
+ source_id = unicode(self.args[1])
+ else:
+ source_id = None
+ import_last_objects(source_id)
+
def print_harvest_sources(self, sources):
if sources:
print ''
--- a/ckanext/harvest/lib/__init__.py Tue May 10 12:57:57 2011 +0100
+++ b/ckanext/harvest/lib/__init__.py Tue May 10 16:06:57 2011 +0100
@@ -2,10 +2,11 @@
from sqlalchemy import distinct,func
from ckan.model import Session, repo
from ckan.model import Package
-
+from ckan.plugins import PluginImplementations
from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject, \
HarvestGatherError, HarvestObjectError
from ckanext.harvest.queue import get_gather_publisher
+from ckanext.harvest.interfaces import IHarvester
log = __import__("logging").getLogger(__name__)
@@ -314,3 +315,37 @@
objects = HarvestObject.filter(**kwds).all()
return [_object_as_dict(obj) for obj in objects]
+def import_last_objects(source_id=None):
+ if source_id:
+ try:
+ source = HarvestSource.get(source_id)
+ except:
+ raise Exception('Source %s does not exist' % source_id)
+ last_objects = Session.query(HarvestObject) \
+ .join(HarvestJob) \
+ .filter(HarvestJob.source==source) \
+ .filter(HarvestObject.package!=None) \
+ .order_by(HarvestObject.guid) \
+ .order_by(HarvestObject.reference_date.desc()) \
+ .order_by(HarvestObject.created.desc()) \
+ .all()
+ else:
+ last_objects = Session.query(HarvestObject) \
+ .filter(HarvestObject.package!=None) \
+ .order_by(HarvestObject.guid) \
+ .order_by(HarvestObject.reference_date.desc()) \
+ .order_by(HarvestObject.created.desc()) \
+ .all()
+
+
+ last_obj_guid = ''
+ imported_objects = []
+ for obj in last_objects:
+ if obj.guid != last_obj_guid:
+ imported_objects.append(obj)
+ for harvester in PluginImplementations(IHarvester):
+ if harvester.get_type() == obj.job.source.type:
+ harvester.import_stage(obj)
+ last_obj_guid = obj.guid
+
+ return imported_objects
Repository URL: https://bitbucket.org/okfn/ckanext-harvest/
--
This is a commit notification from bitbucket.org. You are receiving
this because the commit notification service is enabled and this
email address is listed as its recipient.
More information about the ckan-changes
mailing list