[ckan-changes] commit/ckanext-pdeu: pudo: refactor harvesters to each be in a separate file
Bitbucket
commits-noreply at bitbucket.org
Thu Jun 9 10:11:29 UTC 2011
1 new changeset in ckanext-pdeu:
http://bitbucket.org/okfn/ckanext-pdeu/changeset/40eaef9f86ca/
changeset: 40eaef9f86ca
user: pudo
date: 2011-06-09 12:11:20
summary: refactor harvesters to each be in a separate file
affected #: 9 files (30.5 KB)
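For context, the changeset removes the single ckanext/pdeu/harvesters.py module and moves each harvester class into its own file. A minimal sketch of what the resulting package __init__ might look like is below, so that existing imports such as "from ckanext.pdeu.harvesters import PDEUCKANHarvester" keep resolving; the per-catalogue module names here are illustrative assumptions, not taken from the changeset:

    # ckanext/pdeu/harvesters/__init__.py  (hypothetical layout)
    # Re-export each harvester class from its own module so callers and
    # plugin entry points that import from ckanext.pdeu.harvesters still work.
    from ckanext.pdeu.harvesters.ckan import PDEUCKANHarvester              # module names assumed
    from ckanext.pdeu.harvesters.data_publica import DataPublicaHarvester
    from ckanext.pdeu.harvesters.opengov_se import OpenGovSeHarvester
    from ckanext.pdeu.harvesters.data_london import DataLondonGovUkHarvester
    from ckanext.pdeu.harvesters.data_wien import DataWienGvAtHarvester
    from ckanext.pdeu.harvesters.paris import OpendataParisFrHarvester
    from ckanext.pdeu.harvesters.digitaliser_dk import DigitaliserDkHarvester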
--- a/ckanext/pdeu/harvesters.py Thu Jun 09 11:35:09 2011 +0200
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,725 +0,0 @@
-#coding: utf-8
-import urllib2, urllib, urlparse
-import string
-from datetime import datetime
-
-import logging
-
-from ckan import model
-from ckan.model import Session
-from ckan.logic import ValidationError, NotFound
-from ckan.logic.action.update import package_update_rest
-
-from ckan.lib.helpers import json
-
-from ckanext.harvest.model import HarvestObject
-from ckanext.harvest.harvesters import HarvesterBase
-from ckanext.harvest.harvesters.ckanharvester import CKANHarvester
-from lxml import html, etree
-from cookielib import CookieJar
-
-log = logging.getLogger(__name__)
-
-
-class PDEUCKANHarvester(CKANHarvester):
-
- def info(self):
- return {
- 'name': 'ckan_pdeu',
- 'title': 'CKAN (PublicData.eu)',
- 'description': 'CKAN Harvester modified for PublicData.eu needs',
- 'form_config_interface':'Text'
- }
-
- def import_stage(self,harvest_object):
-
- super(PDEUCKANHarvester, self).import_stage(harvest_object)
-
- # Add some extras to the newly created package
- new_extras = {
- 'eu_country': self.config.get('eu_country',''),
- 'harvest_catalogue_name': self.config.get('harvest_catalogue_name',''),
- 'harvest_catalogue_url': harvest_object.job.source.url,
- 'harvest_dataset_url': harvest_object.job.source.url.strip('/') + '/package/' + harvest_object.package_id
- }
-
- for extra in ['eu_nuts1','eu_nuts2','eu_nuts3']:
- if self.config.get(extra,''):
- new_extras[extra] = self.config[extra]
-
- context = {
- 'model': model,
- 'session': Session,
- 'user': u'harvest',
- 'id': harvest_object.package_id
- }
-
- data_dict = {'extras':new_extras}
- package_update_rest(data_dict,context)
-
-class DataPublicaHarvester(HarvesterBase):
- INITIAL_INDEX = "http://www.data-publica.com/en/data/WebSection_viewContentDetailledList"
- INDEX_URL = "http://www.data-publica.com/en/data"
-
- def info(self):
- return {
- 'name': 'data_publica',
- 'title': 'Data Publica',
- 'description': 'Scraper for data-publica.com'
- }
-
- gathered_ids = []
- page = 1
-
- def _gather_ids(self,url = None, jar= None):
- log.debug('Page %s'%self.page)
- if jar is None:
- jar = CookieJar()
- opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
- url = url or self.INITIAL_INDEX
- fh = opener.open(url)
- doc = html.parse(fh)
- fh.close()
-
- new_ids = []
- for a in doc.findall(".//div[@class='main']//a"):
- href = a.get('href').split('?', 1)[0]
- id = href.split('/').pop()
- if not id in self.gathered_ids:
- log.debug('Got Id: %s' % id)
- #self.queue(DataPublicaDatasetCrawler, url=href)
- new_ids.append(id)
-
- if len(new_ids) == 0: # or self.page == 2:
- return self.gathered_ids
- else:
- self.gathered_ids.extend(new_ids)
-
- inputs = []
- for input in doc.findall(".//form[@id='main_form']//input"):
- inputs.append((input.get('name'), input.get('value')))
- inputs.append(('listbox_nextPage:method', ''))
- next_url = self.INDEX_URL + '?' + urllib.urlencode(inputs)
- self.page = self.page + 1
- return self._gather_ids(url=next_url,jar=jar)
-
- def gather_stage(self,harvest_job):
- log.debug('In DataPublica gather_stage (%s)' % harvest_job.source.url)
-
- remote_ids = self._gather_ids(self.INITIAL_INDEX)
- #remote_ids = ['20110524-36F426','20110524-10821AB','20110523-10DACE3']
-
- return self._create_harvest_objects(remote_ids,harvest_job)
-
-
- def fetch_stage(self,harvest_object):
- log.debug('In DataPublicaHarvester fetch_stage')
- # Get URL
- url = harvest_object.source.url.rstrip('/')
- url = url + '/en/data_set_module/' + harvest_object.guid
-
- # Get contents
- try:
- fh = urllib2.urlopen(url)
- harvest_object.content = fh.read()
- harvest_object.save()
- fh.close()
- return True
- except Exception,e:
- log.exception(e)
- self._save_object_error('Unable to get content for dataset: %s: %r' % \
- (url, e),harvest_object)
-
- def import_stage(self, harvest_object):
- log.debug('In DataPublicaHarvester import_stage')
- if not harvest_object:
- log.error('No harvest object received')
- return False
-
- if harvest_object.content is None:
- self._save_object_error('Empty content for object %s' % harvest_object.id,harvest_object,'Import')
- return False
- try:
- package_dict = {}
- extras_dict = {}
-
- #TODO: Avoid collisions?
- package_dict['id'] = harvest_object.guid
- doc = html.document_fromstring(harvest_object.content)
- for field in doc.findall(".//div"):
- if not 'field' in field.get('class', ''): continue
- name = field.find("label").text.strip()
-
- if name == 'Title':
- package_dict['title'] = field.find("div").xpath("string()").strip()
-
- if name == 'Categories':
- extras_dict['categories'] = []
- for elem in field.findall("div[@class='input']"):
- if not elem.text: continue
- extras_dict['categories'].append(elem.text.strip())
-
- if name == 'Software Licence':
- #TODO: what to do with these?
- a = field.find("div/a")
- if a is not None:
- extras_dict['license_url'] = a.get('href')
- extras_dict['licence'] = a.text.strip()
-
- if name == 'Editor':
- a = field.find("div/a")
- if a is not None:
- package_dict['author'] = a.text.strip()
-
- if name == 'Deposit Date':
- text = field.find("div[@class='input']").xpath("string()")
- text = "".join([c for c in text if c in string.digits+"/:"])
- if len(text.strip()):
- extras_dict['deposit_date'] = datetime.strptime(text, "%d/%m/%Y%H:%M").isoformat()
-
- if name == 'Update Date':
- text = field.find("div[@class='input']").xpath("string()")
- text = "".join([c for c in text if c in string.digits+"/:"])
- if len(text.strip()):
- extras_dict['update_date'] = datetime.strptime(text, "%d/%m/%Y%H:%M").isoformat()
-
- if name == 'Frequency Update':
- text = field.find("div[@class='input']").xpath("string()")
- extras_dict['frequency_update'] = text.strip()
-
- if name == 'Tags':
- package_dict['tags'] = []
- for elem in field.find("div[@class='input']/div").iter():
- tag = None
- if elem.text:
- tag = elem.text.strip()
- if elem.tail:
- tag = elem.tail.strip()
- if tag:
- package_dict['tags'].append(tag)
-
- if name == 'Description':
- text = field.find("div[@class='input']/div").xpath("string()")
- package_dict['notes'] = text.strip()
-
- if name == 'URL':
- # This should link to the orginal URL
- package_dict['url'] = field.find("div/a").get('href')
-
- #FIELD Data Publications
- if name == 'Data Publications':
- package_dict['resources'] = []
- resource_descriptions = [a.text.strip() for a in field.findall(".//div[@class='data']/div[@class='main']//a")]
- resource_formats = [a.text.strip() for a in field.findall(".//div[@class='data']/div[@class='second']//a")]
- resource_links = [a.get('href') for a in field.findall(".//div[@class='icon']//a")]
- for i in range(len(resource_links)):
- package_dict['resources'].append({
- 'url':resource_links[i],
- 'format':resource_formats[i],
- 'description':resource_descriptions[i]
- })
-
-
- # Common extras
- extras_dict['harvest_catalogue_name'] = u'Data Publica'
- extras_dict['harvest_catalogue_url'] = u'http://www.data-publica.com'
- extras_dict['harvest_dataset_url'] = u'http://www.data-publica.com/en/data_set_module/%s' % harvest_object.guid
- extras_dict['eu_country'] = u'FR'
-
- package_dict['name'] = self._gen_new_name(package_dict['title'])
- package_dict['extras'] = extras_dict
-
- except Exception, e:
- log.exception(e)
- self._save_object_error('%r'%e, harvest_object, 'Import')
-
- return self._create_or_update_package(package_dict,harvest_object)
-
-
-from ckanext.rdf.consume import consume_one
-from ckanext.rdf.vocab import Graph
-try: from cStringIO import StringIO
-except ImportError: from StringIO import StringIO
-
-class OpenGovSeHarvester(HarvesterBase):
- INDEX_URL = "http://www.opengov.se/feeds/data/"
- ATOM_NS = "http://www.w3.org/2005/Atom"
-
- def info(self):
- return {
- 'name': 'opengov_se',
- 'title': 'OpenGov.se',
- 'description': 'Harvester for opengov.se'
- }
-
- def gather_stage(self,harvest_job):
- log.debug('In OpenGovSeHarvester gahter_stage')
- # Get feed contents
- doc = etree.parse(self.INDEX_URL)
- remote_ids = []
- for id_element in doc.findall('//{%(ns)s}entry/{%(ns)s}id' % {'ns':self.ATOM_NS}):
- id = id_element.text.strip()
- log.debug('Got id: %s' % id)
- remote_ids.append(id)
-
- return self._create_harvest_objects(remote_ids,harvest_job)
-
- def fetch_stage(self,harvest_object):
- log.debug('In OpenGovSeHarvester fetch_stage')
- url = harvest_object.guid.strip('/') + '/rdf/'
- try:
- fh = urllib2.urlopen(url)
- harvest_object.content = fh.read()
- harvest_object.save()
- fh.close()
- return True
- content = self._get_content(url)
- except Exception, e:
- log.exception(e)
- self._save_object_error('Unable to get content for dataset: %s: %r' % \
- (url, e), harvest_object)
-
- def import_stage(self,harvest_object):
- log.debug('In OpenGovSeHarvester import_stage')
- if not harvest_object:
- log.error('No harvest object received')
- return False
-
- if harvest_object.content is None:
- self._save_object_error('Empty content for object %s' % harvest_object.id,harvest_object,'Import')
- return False
-
- try:
- graph = Graph()
- graph.parse(StringIO(harvest_object.content))
-
- url = harvest_object.guid
- package_dict = consume_one(graph)
- except Exception, e:
- log.exception(e)
- self._save_object_error('%r'%e,harvest_object,'Import')
-
- package_dict['id'] = harvest_object.guid
- if not package_dict['name']:
- package_dict['name'] = self._gen_new_name(package_dict['title'])
-
- # Set the modification date
- if 'date_modified' in package_dict['extras']:
- package_dict['metadata_modified'] = package_dict['extras']['date_modified']
-
- # Common extras
- package_dict['extras']['harvest_catalogue_name'] = u'Opengov.se'
- package_dict['extras']['harvest_catalogue_url'] = u'http://www.opengov.se'
- package_dict['extras']['harvest_dataset_url'] = harvest_object.guid
- package_dict['extras']['eu_country'] = u'SE'
-
- return self._create_or_update_package(package_dict,harvest_object)
-
-import json
-from csv import DictReader
-class DataLondonGovUkHarvester(HarvesterBase):
- CATALOGUE_CSV_URL = "http://data.london.gov.uk/datafiles/datastore-catalogue.csv"
-
- def info(self):
- return {
- 'name': 'data_london_gov_uk',
- 'title': 'data.london.gov.uk',
- 'description': 'CSV Import from GLA Datastore'
- }
-
- def gather_stage(self, harvest_job):
- log.debug('In DataLondonGovUk gather_stage')
-
- csvfh = urllib2.urlopen(self.CATALOGUE_CSV_URL)
- csv = DictReader(csvfh)
- ids = []
- for row in csv:
- id = row.get('DRUPAL_NODE')
- row = dict([(k, v.decode('latin-1')) for k, v in row.items()])
- obj = HarvestObject(guid=id, job=harvest_job,
- content=json.dumps(row))
- obj.save()
- ids.append(obj.id)
- return ids
-
- def fetch_stage(self, harvest_object):
- return True
-
- def import_stage(self,harvest_object):
- if not harvest_object:
- log.error('No harvest object received')
- return False
-
- if harvest_object.content is None:
- self._save_object_error('Empty content for object %s' % harvest_object.id,harvest_object,'Import')
- return False
-
- try:
- row = json.loads(harvest_object.content)
- def csplit(txt):
- return [t.strip() for t in txt.split(",")]
-
- package_dict = {
- 'title': row['TITLE'],
- 'url': row['URL'],
- 'notes': row['LONGDESC'],
- 'author': row['AUTHOR_NAME'],
- 'maintainer': row['MAINTAINER'],
- 'maintainer_email': row['MAINTAINER_EMAIL'],
- 'tags': csplit(row['TAGS']),
- 'license_id': 'ukcrown',
- 'extras': {
- 'date_released': row['RELEASE_DATE'],
- 'categories': csplit(row['CATEGORIES']),
- 'geographical_granularity': row['GEOGRAPHY'],
- 'geographical_coverage': row['EXTENT'],
- 'temporal_granularity': row['UPDATE_FREQUENCY'],
- 'temporal_coverage': row['DATE_RANGE'],
- 'license_summary': row['LICENSE_SUMMARY'],
- 'license_details': row['license_details'],
- 'spatial_reference_system': row['spatial_ref'],
- 'harvest_dataset_url': row['DATASTORE_URL'],
- # Common extras
- 'harvest_catalogue_name': 'London Datastore',
- 'harvest_catalogue_url': 'http://data.london.gov.uk',
- 'eu_country': 'UK',
- 'eu_nuts1': 'UKI'
-
- },
- 'resources': []
- }
-
- def pkg_format(prefix, mime_type):
- if row.get(prefix + "_URL"):
- package_dict['resources'].append({
- 'url': row.get(prefix + "_URL"),
- 'format': mime_type,
- 'description': "%s version" % prefix.lower()
- })
-
- pkg_format('EXCEL', 'application/vnd.ms-excel')
- pkg_format('CSV', 'text/csv')
- pkg_format('TAB', 'text/tsv')
- pkg_format('XML', 'text/xml')
- pkg_format('GOOGLEDOCS', 'api/vnd.google-spreadsheet')
- pkg_format('JSON', 'application/json')
- pkg_format('SHP', 'application/octet-stream+esri')
- pkg_format('KML', 'application/vnd.google-earth.kml+xml')
- except Exception, e:
- log.exception(e)
- self._save_object_error('%r' % e, harvest_object, 'Import')
-
- package_dict['id'] = harvest_object.guid
- package_dict['name'] = self._gen_new_name(package_dict['title'])
- return self._create_or_update_package(package_dict, harvest_object)
-
-
-from lxml import html, etree
-from hashlib import sha1
-class DataWienGvAtHarvester(HarvesterBase):
- CATALOGUE_FEED_URL = "http://data.wien.gv.at/katalog/.indexR.xml"
-
- def info(self):
- return {
- 'name': 'data_wien_gv_at',
- 'title': 'Open Government Data Wien',
- 'description': 'CSV Import from GLA Datastore'
- }
-
- def gather_stage(self, harvest_job):
- log.debug('In DataWienGvAt gather_stage')
-
- doc = etree.parse(self.CATALOGUE_FEED_URL)
- ids = []
- for link in doc.findall("//item/link"):
- link = link.text
- id = sha1(link).hexdigest()
- obj = HarvestObject(guid=id, job=harvest_job, content=link)
- obj.save()
- ids.append(obj.id)
- return ids
-
- def fetch_stage(self, harvest_object):
- doc = html.parse(harvest_object.content)
- package_dict = {'extras': {'harvest_dataset_url': harvest_object.content},
- 'resources': []}
- package_dict['title'] = doc.findtext('//title').split(' | ')[0]
- if not doc.find('//table[@class="BDE-table-frame vie-ogd-table"]'):
- return False
- for meta in doc.findall("//meta"):
- key = meta.get('name')
- value = meta.get('content')
- if key is None or value is None:
- continue
- if key == 'DC.Creator':
- package_dict['author'] = value
- elif key == 'DC.date.created':
- package_dict['metadata_created'] = value
- elif key == 'DC.date.modified':
- package_dict['metadata_modified'] = value
- elif key == 'keywords':
- package_dict['tags'] = value.split(',')
- for row in doc.findall('//table[@class="BDE-table-frame vie-ogd-table"]//tr'):
- key = row.find('th/p').text
- elem = row.find('td')
- if key == 'Beschreibung':
- package_dict['notes'] = elem.xpath("string()")
- elif key == 'Bezugsebene':
- package_dict['extras']['geographic_coverage'] = elem.xpath("string()")
- elif key == 'Zeitraum':
- package_dict['extras']['temporal_coverage'] = elem.xpath("string()")
- elif key == 'Aktualisierung':
- package_dict['extras']['temporal_granularity'] = elem.xpath("string()")
- elif key == 'Kategorien':
- categories = elem.xpath("string()").split(',')
- package_dict['extras']['categories'] = [c.strip() for c in categories]
- elif key == 'Typ':
- package_dict['extras']['type'] = elem.xpath("string()")
- elif key == u'Attribute':
- elem.tag = 'span'
- package_dict['extras']['attributes'] = etree.tostring(elem)
- elif key == u'Datenqualität':
- package_dict['extras']['data_quality'] = elem.xpath("string()")
- elif key == 'Kontakt':
- package_dict['maintainer'] = elem.xpath("string()")
- elif key == 'Lizenz':
- if 'by/3.0/at/deed.de' in elem.findall('.//a')[0].get('href'):
- package_dict['license_id'] = 'cc-by'
- elif key == 'Datensatz':
- for li in elem.findall('.//li'):
- link = li.find('.//a').get('href')
- if li.find('.//abbr') is not None:
- res = {'description': li.xpath('string()'),
- 'url': link,
- 'format': li.find('.//abbr').text}
- package_dict['resources'].append(res)
- else:
- package_dict['url'] = link
-
- harvest_object.content = json.dumps(package_dict)
- harvest_object.save()
- return True
-
- def import_stage(self,harvest_object):
- if not harvest_object:
- log.error('No harvest object received')
- return False
-
- if harvest_object.content is None:
- self._save_object_error('Empty content for object %s' % harvest_object.id,harvest_object,'Import')
- return False
-
- try:
- package_dict = json.loads(harvest_object.content)
- package_dict['id'] = harvest_object.guid
- package_dict['name'] = self._gen_new_name(package_dict['title'])
-
- # Common extras
- package_dict['extras']['harvest_catalogue_name'] = u'Open Government Data Wien'
- package_dict['extras']['harvest_catalogue_url'] = u'http://data.wien.gv.at'
- package_dict['extras']['eu_country'] = u'AT'
- package_dict['extras']['eu_nuts2'] = u'AT13'
-
- return self._create_or_update_package(package_dict, harvest_object)
- except Exception, e:
- log.exception(e)
- self._save_object_error('%r' % e, harvest_object, 'Import')
-
-
-class OpendataParisFrHarvester(HarvesterBase):
- PREFIX_URL = "http://opendata.paris.fr/opendata/"
- CATALOGUE_INDEX_URL = "jsp/site/Portal.jsp?page_id=5"
-
- def info(self):
- return {
- 'name': 'opendata_paris_fr',
- 'title': 'Paris Open Data',
- 'description': 'Bienvenue sur ParisData, le site de la politique Open Data de la Ville de Paris.'
- }
-
- def gather_stage(self, harvest_job):
- log.debug('In OpendataParisFr gather_stage')
-
- doc = html.parse(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
- ids = []
- for link in doc.findall("//div[@class='animate download-portlet-element']/a"):
- link = link.get('href')
- if not "#comments" in link:
- id = sha1(link).hexdigest()
- obj = HarvestObject(guid=id, job=harvest_job, content=link)
- obj.save()
- ids.append(obj.id)
- return ids
-
- def fetch_stage(self, harvest_object):
- doc = html.parse(self.PREFIX_URL + harvest_object.content)
- package_dict = {'extras': {}, 'resources': [], 'tags': []}
- package_dict['title'] = doc.findtext('//h3[@class="fullpage-header"]')
- package_dict['author'] = doc.find('//meta[@name="author"]').get('content')
- package_dict['extras']['harvest_dataset_url'] = self.PREFIX_URL + harvest_object.content
- for p in doc.findall('//div[@id="content"]//p'):
- section = p.find('strong')
- if section is None:
- continue
- key = section.text.strip().encode('utf-8')
- value = section.tail.strip().encode('utf-8')
- if 'Mots' in key:
- package_dict['tags'] = p.findtext('.//span[@id="tags"]').split(',')
- elif 'Description' in key:
- package_dict['notes'] = value
- elif 'publication' in key:
- package_dict['metadata_created'] = value
- elif 'riode couverte par le jeu de don' in key:
- package_dict['extras']['temporal_coverage'] = value
- elif 'quence de mise' in key:
- package_dict['extras']['temporal_granularity'] = value
- elif 'Th' in key:
- package_dict['extras']['categories'] = value
-
- res = self.PREFIX_URL + doc.find('//a[@id="f1"]').get('href')
- package_dict['resources'].append({
- 'url': res,
- 'format': '',
- 'description': 'Telecharger'
- })
- package_dict['license_id'] = 'odc-odbl'
- harvest_object.content = json.dumps(package_dict)
- harvest_object.save()
- return True
-
- def import_stage(self,harvest_object):
- if not harvest_object:
- log.error('No harvest object received')
- return False
-
- if harvest_object.content is None:
- self._save_object_error('Empty content for object %s' % harvest_object.id,harvest_object,'Import')
- return False
-
- try:
- package_dict = json.loads(harvest_object.content)
- package_dict['id'] = harvest_object.guid
- package_dict['name'] = self._gen_new_name(package_dict['title'])
-
- # Common extras
- package_dict['extras']['harvest_catalogue_name'] = u'ParisData'
- package_dict['extras']['harvest_catalogue_url'] = u'http://opendata.paris.fr'
- package_dict['extras']['eu_country'] = u'FR'
- package_dict['extras']['eu_nuts3'] = u'FR101'
-
- return self._create_or_update_package(package_dict, harvest_object)
- except Exception, e:
- log.exception(e)
- self._save_object_error('%r' % e, harvest_object, 'Import')
-
-from lxml import etree
-class DigitaliserDkHarvester(HarvesterBase):
- API_ENDPOINT = "http://api.digitaliser.dk/rest/"
- NS = "{urn:oio:digitaliserdk:rest:1.0}"
- PSN = "{http://rep.oio.dk/ebxml/xml/schemas/dkcc/2003/02/13/}"
-
- def info(self):
- return {
- 'name': 'digitaliser_dk',
- 'title': 'Digitaliser.dk',
- 'description': 'Danish government data and document repository.',
- 'form_config_interface':'Text'
- }
-
- def gather_stage(self, harvest_job):
- log.debug('In Digitaliser.dk gather_stage')
-
- firstResult = 0
- maxResults = 1000
- ids = []
- while True:
- req = 'resources/search?query=&firstResult=%s&maxResults=%s' % \
- (firstResult, maxResults)
- doc = etree.parse(self.API_ENDPOINT + req)
- for handle in doc.findall(self.NS + "ResourceHandle"):
- link = handle.get('handleReference')
- id = sha1(link).hexdigest()
- obj = HarvestObject(guid=id, job=harvest_job, content=link)
- obj.save()
- ids.append(obj.id)
- firstResult += maxResults
- if firstResult > int(doc.getroot().get('totalResults')):
- break
- return ids
-
- def fetch_stage(self, harvest_object):
- doc = etree.parse(harvest_object.content)
- category = doc.findtext('//' + self.NS + 'ResourceCategoryHandle/' + self.NS + 'TitleText')
- if category != "Datakilde":
- return
- package_dict = {'extras': {}, 'resources': [], 'tags': []}
- package_dict['title'] = doc.findtext(self.NS + 'TitleText')
- package_dict['notes'] = doc.findtext(self.NS + 'BodyText')
- package_dict['author'] = doc.findtext(self.NS + \
- 'ResourceOwnerGroupHandle/' + self.NS + 'TitleText')
- package_dict['extras']['harvest_dataset_url'] = harvest_object.content
-
- package_dict['metadata_created'] = doc.findtext(self.NS + 'CreatedDateTime')
- package_dict['metadata_modified'] = doc.find(self.NS + 'PublishedState').get('publishedDateTime')
-
- responsible = doc.findtext(self.NS + 'ResponsibleReference')
- res_doc = etree.parse(responsible)
- package_dict['maintainer'] = res_doc.findtext('//' + self.PSN + 'PersonGivenName') + \
- " " + res_doc.findtext('//' + self.PSN + 'PersonSurnameName')
-
- package_dict['extras']['categories'] = []
- for tax_handle in doc.findall('//' + self.NS + 'TaxonomyNodeHandle'):
- package_dict['extras']['categories'].append(tax_handle.findtext(self.NS + 'TitleText'))
-
- for tag_handle in doc.findall('//' + self.NS + 'TagHandle'):
- package_dict['tags'].append(tag_handle.findtext(self.NS + 'LabelText'))
-
- ref_handle = doc.find('//' + self.NS + 'ReferenceHandle')
- if ref_handle:
- ref_doc = etree.parse(ref_handle.get('handleReference'))
- package_dict['url'] = ref_doc.getroot().get('url')
-
- for artefact in doc.findall('//' + self.NS + 'ArtefactHandle'):
- art_doc = etree.parse(artefact.get('handleReference'))
- package_dict['resources'].append({
- 'url': art_doc.getroot().get('url'),
- 'format': '',
- 'description': artefact.findtext(self.NS + 'TitleText')
- })
-
- #from pprint import pprint
- #pprint(package_dict)
- harvest_object.content = json.dumps(package_dict)
- harvest_object.save()
- return True
-
- def import_stage(self,harvest_object):
- if not harvest_object:
- log.error('No harvest object received')
- return False
-
- if harvest_object.content is None:
- self._save_object_error('Empty content for object %s' % harvest_object.id,harvest_object,'Import')
- return False
-
- try:
- package_dict = json.loads(harvest_object.content)
- package_dict['id'] = harvest_object.guid
- package_dict['name'] = self._gen_new_name(package_dict['title'])
-
- # Common extras
- package_dict['extras']['harvest_catalogue_name'] = u'Digitaliser.dk'
- package_dict['extras']['harvest_catalogue_url'] = u'http://digitaliser.dk'
- package_dict['extras']['eu_country'] = u'DK'
- package_dict['extras']['eu_nuts1'] = u'DK0'
-
- return self._create_or_update_package(package_dict, harvest_object)
- except Exception, e:
- log.exception(e)
- self._save_object_error('%r' % e, harvest_object, 'Import')
-
-
-
-
-
-
Repository URL: https://bitbucket.org/okfn/ckanext-pdeu/