[ckan-changes] commit/ckanext-harvest: 3 new changesets

Bitbucket commits-noreply at bitbucket.org
Tue May 10 15:07:11 UTC 2011


3 new changesets in ckanext-harvest:

http://bitbucket.org/okfn/ckanext-harvest/changeset/0b88a7fd298a/
changeset:   r87:0b88a7fd298a
user:        amercader
date:        2011-05-10 12:05:44
summary:     Add a reference date to the Harvest Objects. This must be set during the harvest
process.
affected #:  2 files (236 bytes)

--- a/ckanext/harvest/interfaces.py	Mon May 09 18:47:30 2011 +0100
+++ b/ckanext/harvest/interfaces.py	Tue May 10 11:05:44 2011 +0100
@@ -24,7 +24,10 @@
             - gathering all the necessary objects to fetch on a later.
               stage (e.g. for a CSW server, perform a GetRecords request)
             - creating the necessary HarvestObjects in the database, specifying
-              the guid and a reference to its source and job.
+              the guid and a reference to its job. The HarvestObjects need a
+              reference date with the last modified date for the resource, this
+              may need to be set in a different stage depending on the type of
+              source.
             - creating and storing any suitable HarvestGatherErrors that may
               occur.
             - returning a list with all the ids of the created HarvestObjects.


--- a/ckanext/harvest/model/__init__.py	Mon May 09 18:47:30 2011 +0100
+++ b/ckanext/harvest/model/__init__.py	Tue May 10 11:05:44 2011 +0100
@@ -119,6 +119,7 @@
 harvest_object_table = Table('harvest_object', metadata,
     Column('id', types.UnicodeText, primary_key=True, default=make_uuid),
     Column('guid', types.UnicodeText, default=''),
+    Column('reference_date', DateTime),
     Column('created', DateTime, default=datetime.datetime.utcnow),
     Column('content', types.UnicodeText, nullable=True),
     Column('harvest_job_id', types.UnicodeText, ForeignKey('harvest_job.id')),


http://bitbucket.org/okfn/ckanext-harvest/changeset/643775c2ca44/
changeset:   r88:643775c2ca44
user:        amercader
date:        2011-05-10 13:57:57
summary:     Save reference date in Harvest Objects when harvesting CKAN instances
affected #:  1 file (70 bytes)

--- a/ckanext/harvest/harvesters.py	Tue May 10 11:05:44 2011 +0100
+++ b/ckanext/harvest/harvesters.py	Tue May 10 12:57:57 2011 +0100
@@ -1,5 +1,14 @@
 import urllib2
 
+from ckan.logic.action.create import package_create_rest
+from ckan.logic.action.update import package_update_rest
+from ckan.logic.action.get import package_show
+from ckan.logic.schema import default_package_schema
+from ckan.logic import ValidationError,NotFound
+from ckan import model
+from ckan.model import Session
+from ckan.lib.navl.validators import ignore_missing
+
 from ckan.lib.helpers import json
 
 from ckan.plugins.core import SingletonPlugin, implements
@@ -134,19 +143,15 @@
             self._save_object_error('Empty content for object %s' % harvest_object.id,harvest_object,'Import')
             return False
         try:
-            from ckan.logic.action.create import package_create_rest
-            from ckan.logic.action.update import package_update_rest
-            from ckan.logic.action.get import package_show
-            from ckan.logic.schema import default_package_schema
-            from ckan.logic import ValidationError,NotFound
-            from ckan import model
-            from ckan.model import Session
-            from ckan.lib.navl.validators import ignore_missing
 
             # harvest_object.content is the result of an API call like
             # http://ec2-46-51-149-132.eu-west-1.compute.amazonaws.com:8081/api/2/rest/package/77d93608-3a3e-42e5-baab-3521afb504f1
             package_dict = json.loads(harvest_object.content)
 
+            # Save reference date in Harvest Object
+            harvest_object.reference_date = package_dict['metadata_modified']
+            harvest_object.save()
+
             ## change default schema
             schema = default_package_schema()
             schema["id"] = [ignore_missing, unicode]


http://bitbucket.org/okfn/ckanext-harvest/changeset/d6573f79b303/
changeset:   r89:d6573f79b303
user:        amercader
date:        2011-05-10 17:06:57
summary:     Add command to reimport existing harvest objects
affected #:  2 files (1.9 KB)

--- a/ckanext/harvest/commands/harvester.py	Tue May 10 12:57:57 2011 +0100
+++ b/ckanext/harvest/commands/harvester.py	Tue May 10 16:06:57 2011 +0100
@@ -38,7 +38,12 @@
 
       harvester fetch_consumer
         - starts the consumer for the fetching queue
-       
+
+      harvester import [{source-id}]
+        - perform the import stage with the last fetched objects, optionally belonging to a certain source.
+          Please note that no objects will be fetched from the remote server. It will only affect
+          the last fetched objects already present in the database.
+
     The commands should be run from the ckanext-harvest directory and expect
     a development.ini file to be present. Most of the time you will
     specify the config explicitly though::
@@ -82,9 +87,10 @@
             logging.getLogger('amqplib').setLevel(logging.INFO)
             consumer = get_fetch_consumer()
             consumer.wait()
-        elif cmd == "initdb":
+        elif cmd == 'initdb':
             self.initdb()
-
+        elif cmd == 'import':
+            self.import_stage()
         else:
             print 'Command %s not recognized' % cmd
 
@@ -185,9 +191,16 @@
             jobs = run_harvest_jobs()
         except:
             pass
-        sys.exit(1)
+        sys.exit(0)
         #print 'Sent %s jobs to the gather queue' % len(jobs)
 
+    def import_stage(self):
+        if len(self.args) >= 2:
+            source_id = unicode(self.args[1])
+        else:
+            source_id = None
+        import_last_objects(source_id)
+
     def print_harvest_sources(self, sources):
         if sources:
             print ''


--- a/ckanext/harvest/lib/__init__.py	Tue May 10 12:57:57 2011 +0100
+++ b/ckanext/harvest/lib/__init__.py	Tue May 10 16:06:57 2011 +0100
@@ -2,10 +2,11 @@
 from sqlalchemy import distinct,func
 from ckan.model import Session, repo
 from ckan.model import Package
-
+from ckan.plugins import PluginImplementations
 from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject, \
                                   HarvestGatherError, HarvestObjectError
 from ckanext.harvest.queue import get_gather_publisher
+from ckanext.harvest.interfaces import IHarvester
 
 log = __import__("logging").getLogger(__name__)
 
@@ -314,3 +315,37 @@
     objects = HarvestObject.filter(**kwds).all()
     return [_object_as_dict(obj) for obj in objects]
 
+def import_last_objects(source_id=None):
+    if source_id:
+        try:
+            source = HarvestSource.get(source_id)
+        except:
+            raise Exception('Source %s does not exist' % source_id)
+        last_objects = Session.query(HarvestObject) \
+                .join(HarvestJob) \
+                .filter(HarvestJob.source==source) \
+                .filter(HarvestObject.package!=None) \
+                .order_by(HarvestObject.guid) \
+                .order_by(HarvestObject.reference_date.desc()) \
+                .order_by(HarvestObject.created.desc()) \
+                .all()
+    else:
+        last_objects = Session.query(HarvestObject) \
+                .filter(HarvestObject.package!=None) \
+                .order_by(HarvestObject.guid) \
+                .order_by(HarvestObject.reference_date.desc()) \
+                .order_by(HarvestObject.created.desc()) \
+                .all()
+
+
+    last_obj_guid = ''
+    imported_objects = []
+    for obj in last_objects:
+        if obj.guid != last_obj_guid:
+            imported_objects.append(obj)
+            for harvester in PluginImplementations(IHarvester):
+                if harvester.get_type() == obj.job.source.type:
+                    harvester.import_stage(obj)
+        last_obj_guid = obj.guid
+
+    return imported_objects

Repository URL: https://bitbucket.org/okfn/ckanext-harvest/

--

This is a commit notification from bitbucket.org. You are receiving
this because the commit notification service is enabled for this
repository and is addressed to the recipient of this email.




More information about the ckan-changes mailing list