Commit 623bfdaa authored by abuddenberg's avatar abuddenberg
Browse files

Add support for 50 States metadata. Clean up a lot of hacks.

parent 699cb1be
...@@ -123,21 +123,22 @@ class Figure(GcisObject): ...@@ -123,21 +123,22 @@ class Figure(GcisObject):
#TODO: Ordinal handling is unnecessarily complex #TODO: Ordinal handling is unnecessarily complex
@figure_num.setter @figure_num.setter
def figure_num(self, value): def figure_num(self, value):
try: if value:
chp, fig = value.split('.') try:
chp = int(chp) chp, fig = value.split('.')
fig = int(fig) chp = int(chp)
except ValueError: fig = int(fig)
print 'Invalid chapter/figure numbers: ' + value except ValueError:
chp = None print 'Invalid chapter/figure numbers: ' + value
fig = None chp = None
self.ordinal = fig fig = None
self.ordinal = fig
#If we have an actual Chapter instance, populate it
if isinstance(self.chapter, Chapter): #If we have an actual Chapter instance, populate it
self.chapter.number = chp if isinstance(self.chapter, Chapter):
else: self.chapter.number = chp
self.chapter = chp else:
self.chapter = chp
def as_json(self, indent=0, omit_fields=('images', 'chapter', 'kindred_figures', 'keywords')): def as_json(self, indent=0, omit_fields=('images', 'chapter', 'kindred_figures', 'keywords')):
return super(Figure, self).as_json(omit_fields=omit_fields) return super(Figure, self).as_json(omit_fields=omit_fields)
...@@ -277,6 +278,8 @@ class Dataset(GcisObject): ...@@ -277,6 +278,8 @@ class Dataset(GcisObject):
self._release_dt = parse(value).isoformat() if value else None self._release_dt = parse(value).isoformat() if value else None
except TypeError: except TypeError:
self._release_dt = None self._release_dt = None
except ValueError:
self._release_dt = None
@property @property
def access_dt(self): def access_dt(self):
...@@ -289,6 +292,8 @@ class Dataset(GcisObject): ...@@ -289,6 +292,8 @@ class Dataset(GcisObject):
except TypeError: except TypeError:
# print "Problem with date: " + self.access_dt # print "Problem with date: " + self.access_dt
self._access_dt = None self._access_dt = None
except ValueError:
self._access_dt = None
@property @property
def publication_year(self): def publication_year(self):
......
...@@ -26,9 +26,9 @@ def get_credentials(): ...@@ -26,9 +26,9 @@ def get_credentials():
def parse_title(graphic_title): def parse_title(graphic_title):
match = re.search('\w+\.\d+', graphic_title) match = re.search('^(\d+[a-z]?)\.', graphic_title)
if match: if match:
return match.group(0), graphic_title[match.end(0):].strip() return match.group(1), graphic_title[match.end(0):].strip()
else: else:
return None, graphic_title return None, graphic_title
...@@ -36,19 +36,8 @@ def parse_title(graphic_title): ...@@ -36,19 +36,8 @@ def parse_title(graphic_title):
def populate_figure(fig_json): def populate_figure(fig_json):
f = Figure({}) f = Figure({})
try: try:
if fig_json['graphics_title'].startswith('ES'): figure_num, title = parse_title(fig_json['graphics_title'])
title_fields = fig_json['graphics_title'].split('. ') f.ordinal = figure_num if figure_num else None
title = ' '.join(title_fields[1:])
f.ordinal = re.search('\d+', title_fields[0]).group(0)
else:
figure_num, title = parse_title(fig_json['graphics_title'])
if figure_num and figure_num.startswith('TSD'):
f.ordinal = figure_num.split('.')[1]
else:
f.figure_num = figure_num if figure_num else None
f.title = title f.title = title
f.identifier = fig_json['figure_id'] if fig_json['figure_id'] else re.sub('\W', '_', f.title).lower() f.identifier = fig_json['figure_id'] if fig_json['figure_id'] else re.sub('\W', '_', f.title).lower()
f.create_dt = fig_json['graphics_create_date'].strip() f.create_dt = fig_json['graphics_create_date'].strip()
...@@ -219,7 +208,7 @@ class SurveyClient: ...@@ -219,7 +208,7 @@ class SurveyClient:
figure_json = tier1_json['figure'] figure_json = tier1_json['figure']
#It's not worth trying to translations on this data; it's too different #It's not worth trying to translations on this data; it's too different
f = populate_figure(figure_json) f = populate_figure(figure_json)
f.remote_path = survey_json[0]['filepath'] f.remote_path = survey_json[0]['filepath'].replace('sites/default/', 'system/')
f.local_path = join(self.local_download_dir, basename(f.remote_path)) if f.remote_path else None f.local_path = join(self.local_download_dir, basename(f.remote_path)) if f.remote_path else None
if 'copyright' in survey_json[0]: if 'copyright' in survey_json[0]:
......
...@@ -9,7 +9,9 @@ DATASET_IDS = { ...@@ -9,7 +9,9 @@ DATASET_IDS = {
'ArboNet': 'cdc-arbonet', 'ArboNet': 'cdc-arbonet',
'U.S. Natural Hazard Statistics': 'noaa-nws-us-natural-hazard-statistics', 'U.S. Natural Hazard Statistics': 'noaa-nws-us-natural-hazard-statistics',
'Billion-Dollar Weather and Climate Disasters': 'noaa-ncdc-billion-dollar-weather-climate-disasters', 'Billion-Dollar Weather and Climate Disasters': 'noaa-ncdc-billion-dollar-weather-climate-disasters',
'ESRI USA10 dataset (ArcGIS version 10.0)': 'esri-arcgis-v10-0' 'ESRI USA10 dataset (ArcGIS version 10.0)': 'esri-arcgis-v10-0',
'nClimDiv': 'noaa-ncdc-cag-us-temperature-nclimdiv',
'Global Historical Climatology Network (GHCN) Daily': 'noaa-ncdc-ghcn-daily'
} }
COPYRIGHT_TRANSLATIONS = { COPYRIGHT_TRANSLATIONS = {
......
...@@ -31,9 +31,9 @@ def parse_creators(field): ...@@ -31,9 +31,9 @@ def parse_creators(field):
first_name, last_name = name_split[0], name_split[-1] first_name, last_name = name_split[0], name_split[-1]
org_name = rest[0] if len(rest) > 0 else None org_name = rest[0] if len(rest) > 0 else None
contributor = Contributor({}, hints=trans.CONTRIB_ROLES) contributor = Contributor({})
contributor.person = Person({'first_name': first_name, 'last_name': last_name}) contributor.person = Person({'first_name': first_name, 'last_name': last_name})
contributor.organization = Organization({'name': org_name}, known_ids=trans.ORG_IDS) contributor.organization = Organization({'name': org_name})
return contributor return contributor
...@@ -92,7 +92,7 @@ class WebformClient: ...@@ -92,7 +92,7 @@ class WebformClient:
#Add provenance information (wasDerivedFrom parent) #Add provenance information (wasDerivedFrom parent)
if 'what_type_of_source_provided_this_figure' in figure_json and figure_json[ if 'what_type_of_source_provided_this_figure' in figure_json and figure_json[
'what_type_of_source_provided_this_figure'] == 'published_source': 'what_type_of_source_provided_this_figure'] == 'published_source':
f.add_parent(Parent(deepcopy(f.original), trans=trans.PARENT_TRANSLATIONS, pubtype_map=trans.PARENT_PUBTYPE_MAP, search_hints=trans.PARENT_SEARCH_HINTS)) f.add_parent(Parent(deepcopy(f.original), trans=trans.PARENT_TRANSLATIONS, pubtype_map=trans.PARENT_PUBTYPE_MAP))
if 'images' in webform_json[webform_nid]: if 'images' in webform_json[webform_nid]:
for img_idx, image in enumerate(webform_json[webform_nid]['images']): for img_idx, image in enumerate(webform_json[webform_nid]['images']):
...@@ -134,8 +134,8 @@ class WebformClient: ...@@ -134,8 +134,8 @@ class WebformClient:
activity_json['identifier'] = '-'.join((image_obj.identifier.split('-')[0], dataset.identifier, 'process')) activity_json['identifier'] = '-'.join((image_obj.identifier.split('-')[0], dataset.identifier, 'process'))
dataset.activity = Activity(activity_json, trans=trans.ACT_TRANSLATIONS) dataset.activity = Activity(activity_json, trans=trans.ACT_TRANSLATIONS)
#TODO: Extract DOIs from citation # TODO: Extract DOIs from citation
image_obj.datasets.append(dataset) # image_obj.datasets.append(dataset)
f.images.append(image_obj) f.images.append(image_obj)
#If download_images arg is set, attempt to download all images for this figure #If download_images arg is set, attempt to download all images for this figure
......
This diff is collapsed.
__author__ = 'abuddenberg'
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from gcis_clients import GcisClient, SurveyClient, survey_token, gcis_dev_auth, gcis_stage_auth
from gcis_clients.domain import Report, Chapter
from sync_utils import realize_parents, realize_contributors
from states import sync_metadata_tree
import pickle
import sys
import re
# gcis = GcisClient('https://data-stage.globalchange.gov', *gcis_stage_auth)
# gcis = GcisClient('https://data.globalchange.gov', *gcis_stage_auth)
surveys = SurveyClient('https://state-resources.cicsnc.org', survey_token)
def main():
    # Sync figure/dataset metadata from the CICS-NC survey tool into GCIS for
    # every (report, chapter, survey) triple declared in sync_metadata_tree.
    #
    # NOTE(review): `gcis` is never defined in this module -- both GcisClient
    # assignments near the top of the file are commented out, so this function
    # raises NameError as written. Uncomment one of them (dev or prod) before
    # running; presumably left commented to avoid accidental writes to prod.
    print(gcis.test_login())
    # sync_metadata_tree: report_id -> chapter_id -> [(survey_url, figure_id, figure_num), ...]
    for report_id in sync_metadata_tree:
        for chapter_id in sync_metadata_tree[report_id]:
            for survey_url, figure_id, figure_num in sync_metadata_tree[report_id][chapter_id]:
                # Fetch the survey and download its image assets locally.
                figure, datasets = surveys.get_survey(survey_url, do_download=True)
                print(survey_url)
                print(figure, datasets)
                # Ensure parent publications and contributors exist in GCIS
                # before the figure references them.
                realize_parents(gcis, figure.parents)
                realize_contributors(gcis, figure.contributors)
                print('Contributors: ', figure.contributors)
                print('Parents: ', figure.parents)
                # gcis_fig = gcis.get_figure(report_id, figure_id, chapter_id=chapter_id)
def gen_survey_list():
    """Fetch every survey from the survey client and group them by chapter.

    Returns a dict mapping chapter identifier (parsed from the survey's
    node_title, e.g. 'group/new-york' -> 'new-york'; '' when it doesn't
    match) to a list of (url, ordinal, identifier, title) tuples.
    """
    # Hoisted out of the loop: the chapter id is the lowercase-hyphen token
    # after 'group/' in the node title.
    chapter_pattern = re.compile('group/([a-z-]+)')

    grouped = {}
    all_surveys = surveys.get_list()
    total = len(all_surveys)

    for position, entry in enumerate(all_surveys, start=1):
        survey_url = entry['url']
        title_match = chapter_pattern.match(entry['node_title'])
        chapter_id = title_match.group(1) if title_match else ''

        print('Processing: {b}{url} ({i}/{total})'.format(b=surveys.base_url, url=survey_url, i=position, total=total))

        fig, _datasets = surveys.get_survey(survey_url)
        if fig:
            print(fig.identifier)
            print(chapter_id, fig.ordinal, fig.title)
            grouped.setdefault(chapter_id, []).append((survey_url, fig.ordinal, fig.identifier, fig.title))
        print('')

    return grouped
def create_nlss_report():
    """Create the NOAA-led State Summaries 2016 report in GCIS, then one
    chapter per state listed in sync_metadata_tree, numbered in order.
    """
    report = Report({
        'identifier': 'noaa-led-state-summaries-2016',
        'report_type_identifier': 'report',
        'title': 'NOAA-led State Summaries 2016',
        'url': 'https://statesummaries.cicsnc.org/',
        'publication_year': '2016',
        'contact_email': ''
    })

    print(gcis.create_report(report))

    # Chapter identifiers come straight from the metadata tree; numbering is
    # their 1-based position there.
    for position, chapter_id in enumerate(sync_metadata_tree['noaa-led-state-summaries-2016'], start=1):
        # Title-case the slug: 'new-york' -> 'New York'
        chapter_title = ' '.join(word.capitalize() for word in chapter_id.split('-'))
        chapter = Chapter({
            'identifier': chapter_id,
            'number': position,
            'title': chapter_title,
            'report_identifier': report.identifier
        })
        print(gcis.create_chapter(report.identifier, chapter))
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment