Commit 623bfdaa authored by abuddenberg's avatar abuddenberg
Browse files

Add support for 50 States metadata. Clean up a lot of hacks.

parent 699cb1be
......@@ -123,21 +123,22 @@ class Figure(GcisObject):
#TODO: Ordinal handling is unnecessarily complex
@figure_num.setter
def figure_num(self, value):
try:
chp, fig = value.split('.')
chp = int(chp)
fig = int(fig)
except ValueError:
print 'Invalid chapter/figure numbers: ' + value
chp = None
fig = None
self.ordinal = fig
#If we have an actual Chapter instance, populate it
if isinstance(self.chapter, Chapter):
self.chapter.number = chp
else:
self.chapter = chp
if value:
try:
chp, fig = value.split('.')
chp = int(chp)
fig = int(fig)
except ValueError:
print 'Invalid chapter/figure numbers: ' + value
chp = None
fig = None
self.ordinal = fig
#If we have an actual Chapter instance, populate it
if isinstance(self.chapter, Chapter):
self.chapter.number = chp
else:
self.chapter = chp
def as_json(self, indent=0, omit_fields=('images', 'chapter', 'kindred_figures', 'keywords')):
return super(Figure, self).as_json(omit_fields=omit_fields)
......@@ -277,6 +278,8 @@ class Dataset(GcisObject):
self._release_dt = parse(value).isoformat() if value else None
except TypeError:
self._release_dt = None
except ValueError:
self._release_dt = None
@property
def access_dt(self):
......@@ -289,6 +292,8 @@ class Dataset(GcisObject):
except TypeError:
# print "Problem with date: " + self.access_dt
self._access_dt = None
except ValueError:
self._access_dt = None
@property
def publication_year(self):
......
......@@ -26,9 +26,9 @@ def get_credentials():
def parse_title(graphic_title):
match = re.search('\w+\.\d+', graphic_title)
match = re.search('^(\d+[a-z]?)\.', graphic_title)
if match:
return match.group(0), graphic_title[match.end(0):].strip()
return match.group(1), graphic_title[match.end(0):].strip()
else:
return None, graphic_title
......@@ -36,19 +36,8 @@ def parse_title(graphic_title):
def populate_figure(fig_json):
f = Figure({})
try:
if fig_json['graphics_title'].startswith('ES'):
title_fields = fig_json['graphics_title'].split('. ')
title = ' '.join(title_fields[1:])
f.ordinal = re.search('\d+', title_fields[0]).group(0)
else:
figure_num, title = parse_title(fig_json['graphics_title'])
if figure_num and figure_num.startswith('TSD'):
f.ordinal = figure_num.split('.')[1]
else:
f.figure_num = figure_num if figure_num else None
figure_num, title = parse_title(fig_json['graphics_title'])
f.ordinal = figure_num if figure_num else None
f.title = title
f.identifier = fig_json['figure_id'] if fig_json['figure_id'] else re.sub('\W', '_', f.title).lower()
f.create_dt = fig_json['graphics_create_date'].strip()
......@@ -219,7 +208,7 @@ class SurveyClient:
figure_json = tier1_json['figure']
#It's not worth trying to do translations on this data; it's too different
f = populate_figure(figure_json)
f.remote_path = survey_json[0]['filepath']
f.remote_path = survey_json[0]['filepath'].replace('sites/default/', 'system/')
f.local_path = join(self.local_download_dir, basename(f.remote_path)) if f.remote_path else None
if 'copyright' in survey_json[0]:
......
......@@ -9,7 +9,9 @@ DATASET_IDS = {
'ArboNet': 'cdc-arbonet',
'U.S. Natural Hazard Statistics': 'noaa-nws-us-natural-hazard-statistics',
'Billion-Dollar Weather and Climate Disasters': 'noaa-ncdc-billion-dollar-weather-climate-disasters',
'ESRI USA10 dataset (ArcGIS version 10.0)': 'esri-arcgis-v10-0'
'ESRI USA10 dataset (ArcGIS version 10.0)': 'esri-arcgis-v10-0',
'nClimDiv': 'noaa-ncdc-cag-us-temperature-nclimdiv',
'Global Historical Climatology Network (GHCN) Daily': 'noaa-ncdc-ghcn-daily'
}
COPYRIGHT_TRANSLATIONS = {
......
......@@ -31,9 +31,9 @@ def parse_creators(field):
first_name, last_name = name_split[0], name_split[-1]
org_name = rest[0] if len(rest) > 0 else None
contributor = Contributor({}, hints=trans.CONTRIB_ROLES)
contributor = Contributor({})
contributor.person = Person({'first_name': first_name, 'last_name': last_name})
contributor.organization = Organization({'name': org_name}, known_ids=trans.ORG_IDS)
contributor.organization = Organization({'name': org_name})
return contributor
......@@ -92,7 +92,7 @@ class WebformClient:
#Add provenance information (wasDerivedFrom parent)
if 'what_type_of_source_provided_this_figure' in figure_json and figure_json[
'what_type_of_source_provided_this_figure'] == 'published_source':
f.add_parent(Parent(deepcopy(f.original), trans=trans.PARENT_TRANSLATIONS, pubtype_map=trans.PARENT_PUBTYPE_MAP, search_hints=trans.PARENT_SEARCH_HINTS))
f.add_parent(Parent(deepcopy(f.original), trans=trans.PARENT_TRANSLATIONS, pubtype_map=trans.PARENT_PUBTYPE_MAP))
if 'images' in webform_json[webform_nid]:
for img_idx, image in enumerate(webform_json[webform_nid]['images']):
......@@ -134,8 +134,8 @@ class WebformClient:
activity_json['identifier'] = '-'.join((image_obj.identifier.split('-')[0], dataset.identifier, 'process'))
dataset.activity = Activity(activity_json, trans=trans.ACT_TRANSLATIONS)
#TODO: Extract DOIs from citation
image_obj.datasets.append(dataset)
# TODO: Extract DOIs from citation
# image_obj.datasets.append(dataset)
f.images.append(image_obj)
#If download_images arg is set, attempt to download all images for this figure
......
This diff is collapsed.
__author__ = 'abuddenberg'
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from gcis_clients import GcisClient, SurveyClient, survey_token, gcis_dev_auth, gcis_stage_auth
from gcis_clients.domain import Report, Chapter
from sync_utils import realize_parents, realize_contributors
from states import sync_metadata_tree
import pickle
import sys
import re
# NOTE(review): both GcisClient instantiations are commented out, yet `gcis`
# is referenced by main() and create_nlss_report() below -- running either
# function as-is will raise NameError. Uncomment exactly one target
# environment (stage vs. production) before running; TODO confirm that the
# production URL really should use *gcis_stage_auth* credentials.
# gcis = GcisClient('https://data-stage.globalchange.gov', *gcis_stage_auth)
# gcis = GcisClient('https://data.globalchange.gov', *gcis_stage_auth)
surveys = SurveyClient('https://state-resources.cicsnc.org', survey_token)
def main():
    """Sync figure metadata from the state-resources survey site into GCIS.

    Walks every (survey_url, figure_id, figure_num) triple in the
    report -> chapter -> figures sync tree, downloads the survey figure,
    and resolves its parent publications and contributors against GCIS.
    """
    print(gcis.test_login())
    for report_id, chapters in sync_metadata_tree.items():
        for chapter_id, figure_entries in chapters.items():
            for survey_url, figure_id, figure_num in figure_entries:
                figure, datasets = surveys.get_survey(survey_url, do_download=True)
                print(survey_url)
                print(figure, datasets)
                # Resolve parents/contributors to real GCIS entities in place
                realize_parents(gcis, figure.parents)
                realize_contributors(gcis, figure.contributors)
                print('Contributors: ', figure.contributors)
                print('Parents: ', figure.parents)
                # gcis_fig = gcis.get_figure(report_id, figure_id, chapter_id=chapter_id)
def gen_survey_list():
    """Fetch every survey from the survey site and group it by chapter.

    Returns a dict mapping chapter identifier (parsed from the survey's
    'group/<chapter>' node title, '' when absent) to a list of
    (url, ordinal, identifier, title) tuples.

    Example for caching and pretty-printing the result:
        with open('survey_list.pk', 'wb') as out:
            pickle.dump(gen_survey_list(), out)
        surveys = pickle.load(open('survey_list.pk'))
        for st in sync_metadata_tree['noaa-led-state-summaries-2016']:
            print(st)
            for f in sorted(surveys[st], key=lambda x: x[1]):
                print("('{0}', '{1}', '{2}'),".format(f[0], f[2], f[1]))
            print('')
    """
    by_chapter = {}
    all_surveys = surveys.get_list()
    total = len(all_surveys)
    for idx, entry in enumerate(all_surveys, start=1):
        url = entry['url']
        group_match = re.match('group/([a-z-]+)', entry['node_title'])
        chapter = group_match.group(1) if group_match else ''
        print('Processing: {b}{url} ({i}/{total})'.format(b=surveys.base_url, url=url, i=idx, total=total))
        fig, ds = surveys.get_survey(url)
        if fig:
            print(fig.identifier)
            print(chapter, fig.ordinal, fig.title)
            by_chapter.setdefault(chapter, []).append((url, fig.ordinal, fig.identifier, fig.title))
            print('')
    return by_chapter
def create_nlss_report():
    """Create the NOAA-led State Summaries 2016 report in GCIS, then one
    chapter per state listed in the sync tree.

    Chapter titles are derived from their identifiers, e.g.
    'north-carolina' -> 'North Carolina'; chapter numbers follow the
    sync-tree ordering starting at 1.
    """
    report = Report({
        'identifier': 'noaa-led-state-summaries-2016',
        'report_type_identifier': 'report',
        'title': 'NOAA-led State Summaries 2016',
        'url': 'https://statesummaries.cicsnc.org/',
        'publication_year': '2016',
        'contact_email': ''
    })
    print(gcis.create_report(report))
    for number, chapter_id in enumerate(sync_metadata_tree['noaa-led-state-summaries-2016'], start=1):
        title = ' '.join(word.capitalize() for word in chapter_id.split('-'))
        chapter = Chapter({
            'identifier': chapter_id,
            'number': number,
            'title': title,
            'report_identifier': report.identifier
        })
        print(gcis.create_chapter(report.identifier, chapter))
# Guard the entry point so importing this module (e.g. to reuse
# gen_survey_list or create_nlss_report) does not trigger a full sync run.
if __name__ == '__main__':
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment