webform_client.py 7.86 KB
Newer Older
1
2
3
#!/usr/bin/python

import urllib
abuddenberg's avatar
abuddenberg committed
4
import re
5
from os.path import join
6
import getpass
7
import requests
8
from dateutil.parser import parse
9
from copy import deepcopy
abuddenberg's avatar
abuddenberg committed
10

11
from domain import Figure, Image, Dataset, Activity, Contributor, Person, Organization, Parent
abuddenberg's avatar
abuddenberg committed
12
13
14
15
16


def sanitized(pattern):
    """Build a decorator that guards a call behind a regex check.

    The guarded value is the second positional argument (``args[1]``), i.e.
    the first argument after ``self`` when a method is decorated. It is
    percent-encoded with urllib's ``quote`` and tested against ``pattern``
    with ``re.match``.

    When the value matches, the wrapped callable runs and its result is
    returned. Otherwise a 'Rejected' line is printed and ``None`` is returned
    without calling the wrapped callable.

    NOTE: the guarded value must be passed positionally; passing it by
    keyword raises IndexError because only ``args`` is inspected.
    """
    # Local imports keep this block self-contained.
    import functools

    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    def dec(fn):
        # Preserve the wrapped callable's name and docstring.
        @functools.wraps(fn)
        def wrapped(*args, **kwargs):
            candidate = args[1]
            if re.match(pattern, quote(candidate)):
                return fn(*args, **kwargs)
            # Same text the original print statement emitted (its literal
            # ends in a space and print added a separator space after it).
            print('Rejected:  %s' % (candidate,))
            return None
        return wrapped
    return dec


abuddenberg's avatar
abuddenberg committed
25
26
27
28
29
30
31
32
def parse_creators(field):
    """Turn a free-text creator field into a Contributor.

    The field is split on commas. The text before the first comma is treated
    as a personal name: its first whitespace-separated token becomes the
    first name and its last token the last name (so a single-word name is
    used for both). The text between the first and second comma, if any, is
    the organization name; anything after that is ignored.

    A ``None``, empty or whitespace-only name yields ``None`` for both name
    parts instead of raising. The organization name is stripped of
    surrounding whitespace, and is ``None`` when absent or blank.

    Returns a Contributor whose ``person`` and ``organization`` attributes
    are set from the parsed values.
    """
    parts = (field or '').split(',')

    name_tokens = parts[0].split()
    first_name = name_tokens[0] if name_tokens else None
    last_name = name_tokens[-1] if name_tokens else None

    # "Name, Org" leaves a leading space on the organization; strip it.
    org_name = (parts[1].strip() or None) if len(parts) > 1 else None

    contributor = Contributor({})
    contributor.person = Person({'first_name': first_name, 'last_name': last_name})
    contributor.organization = Organization({'name': org_name})

    return contributor
abuddenberg's avatar
abuddenberg committed
38

39

40
41
42
43
44
45
46
47
48
49
50
def get_credentials():
    """Return the webform token, prompting for it if none is preconfigured.

    ``gcis_clients.webform_token`` is consulted first (per the original
    comment it reflects the WEBFORM_TOKEN environment variable -- confirm in
    gcis_clients). If it is None, the token is read interactively with
    getpass.
    """
    # Imported at call time, as the original did.
    from gcis_clients import webform_token

    if webform_token is None:
        return getpass.getpass('Webform token: ')
    return webform_token


51
class WebformClient:
    """HTTP client for a webform metadata service.

    Every request is sent to ``base_url`` plus a path, with the access token
    appended as a ``token`` query parameter. Figure webforms are converted
    into Figure / Image / Dataset / Activity objects.
    """

    # Key in an image's JSON that holds the image file name.
    _FILENAME_KEY = 'what_is_the_file_name_extension_of_the_image'

    def __init__(self, url, token, local_image_dir=None, remote_dir='/system/files/'):
        """Store connection settings.

        url: base URL of the service (paths are appended to it verbatim).
        token: access token; when None it is obtained via get_credentials().
        local_image_dir: directory for downloaded images; when falsy,
            gcis_clients.default_image_dir() is used.
        remote_dir: remote path prefix prepended to image file names.
        """
        self.base_url = url

        # If token was not provided, obtain it
        if token is None:
            token = get_credentials()
        self.token = token

        if local_image_dir:
            self.images_dir = local_image_dir
        else:
            from gcis_clients import default_image_dir
            self.images_dir = default_image_dir()
        self.remote_image_dir = remote_dir

    def _tokenized_url(self, path):
        """Return base_url + path with the token appended as a query string."""
        return '{b}{path}?token={t}'.format(b=self.base_url, path=path, t=self.token)

    def get_list(self):
        """GET /metadata/list and return the decoded JSON response."""
        return requests.get(self._tokenized_url('/metadata/list')).json()

    def get_all_webforms(self):
        """Placeholder: not implemented, returns None."""
        pass

    @sanitized(r'^/metadata/figures/\d+$')
    def get_webform(self, fig_url, download_images=False):
        """Fetch one figure webform and build a Figure from it.

        fig_url: path of the form /metadata/figures/<digits>; must be passed
            positionally, because the sanitized decorator inspects
            positional arguments only. Other values are rejected by the
            decorator, in which case None is returned.
        download_images: when true and the webform has an 'images' entry,
            download every image of the figure after parsing.

        Returns the populated Figure.
        """
        webform_json = requests.get(self._tokenized_url(fig_url)).json()

        # TODO: refactor the service so this isn't necessary
        # The payload is keyed by a single node id; take the first key.
        webform_nid = list(webform_json)[0]
        webform = webform_json[webform_nid]

        figure_json = webform['figure'][0]
        f = Figure(figure_json)

        # Add contributor info
        if 'list_the_creator_of_the_figure' in figure_json:
            f.add_contributor(parse_creators(figure_json['list_the_creator_of_the_figure']))

        # Add provenance information (wasDerivedFrom parent)
        if figure_json.get('what_type_of_source_provided_this_figure') == 'published_source':
            f.add_parent(Parent(deepcopy(f.original)))

        if 'images' in webform:
            for image_json in webform['images']:
                f.images.append(self._build_image(image_json, fig_url, f))

            # If download_images arg is set, attempt to download all images for this figure
            if download_images:
                self.download_all_images(f)
        return f

    def _build_image(self, image_json, fig_url, figure):
        """Build an Image (with contributor and datasets) from its JSON."""
        image_obj = Image(image_json, local_path=self.get_local_image_path(image_json),
                          remote_path=self.get_remote_image_path(image_json))

        # Add contributor info
        if 'list_the_creator_of_the_image' in image_json:
            image_obj.add_contributor(parse_creators(image_json['list_the_creator_of_the_image']))

        for dataset_json in image_json.get('datasources') or ():
            image_obj.datasets.append(
                self._build_dataset(dataset_json, image_obj, fig_url, figure))

        return image_obj

    def _build_dataset(self, dataset_json, image_obj, fig_url, figure):
        """Build a Dataset and its synthetic Activity from one datasource."""
        dataset = Dataset(dataset_json)

        # Temporal extent is "<start> <end>" in ISO format; unparseable
        # values are reported and leave the extent unset.
        try:
            dataset.temporal_extent = ' '.join(
                [parse(dataset_json[field]).isoformat() for field in ('start_time', 'end_time')]
            )
        except (TypeError, ValueError) as e:
            print('Problem with start/end time:  %s %s %s' % (fig_url, figure.title, e))
            print('%s %s' % (dataset_json['start_time'], dataset_json['end_time']))
            dataset.temporal_extent = None

        dataset.spatial_extent = ' '.join(
            ['{k}: {v};'.format(k=key, v=dataset_json[key]) for key in
             ('maximum_latitude', 'minimum_latitude', 'maximum_longitude', 'minimum_longitude')]
        )

        # Filter overlapping Dataset keys out
        activity_json = {k: dataset_json[k] for k in dataset_json if
                         k not in ('href', 'uri', 'identifier', 'start_time', 'end_time')}

        # Add synthetic identifier
        activity_json['identifier'] = '-'.join(
            (image_obj.identifier.split('-')[0], dataset.identifier, 'process'))
        dataset.activity = Activity(activity_json)

        # TODO: Extract DOIs from citation
        return dataset

    def _image_filename(self, image_json):
        """Return the lower-cased image file name, or None if unavailable."""
        if image_json in (None, ''):
            return None
        # .get() so an image without the key yields None instead of KeyError
        # (assumes image_json is a dict decoded from JSON).
        filename = image_json.get(self._FILENAME_KEY)
        if filename in (None, ''):
            return None
        return filename.lower()

    def get_remote_image_path(self, image_json):
        """Return remote_image_dir + file name, or None without a file name."""
        filename = self._image_filename(image_json)
        if filename is not None:
            return self.remote_image_dir + filename

    def get_local_image_path(self, image_json):
        """Return images_dir joined with the file name, or None without one."""
        filename = self._image_filename(image_json)
        if filename is not None:
            return join(self.images_dir, filename)

    def remote_image_exists(self, path):
        """Return True when a HEAD request for the path answers 200."""
        return requests.head(self._tokenized_url(path)).status_code == 200

    def download_image(self, image):
        """Download image.remote_path into images_dir.

        Returns the local file path on success. Raises Exception with an
        'Image not found' message on 404, or with the status code on any
        other non-200 response.
        """
        url = self._tokenized_url(image.remote_path)
        resp = requests.get(url, stream=True)

        # Streamed responses hold their connection until closed.
        try:
            if resp.status_code == 200:
                filepath = join(self.images_dir, image.remote_path.split('/')[-1])
                with open(filepath, 'wb') as image_out:
                    for chunk in resp.iter_content(chunk_size=4096):
                        image_out.write(chunk)

                return filepath
            elif resp.status_code == 404:
                # NOTE(review): this message includes the token-bearing URL.
                raise Exception('Image not found: {u}'.format(u=url))
            else:
                raise Exception(resp.status_code)
        finally:
            resp.close()

    def download_all_images(self, figure):
        """Download every image in figure.images."""
        for image in figure.images:
            self.download_image(image)

    def get_aggregated_datasets(self):
        """Collect the datasets of every listed webform, keyed by identifier.

        The first dataset seen for an identifier is kept; later ones with the
        same identifier are merged into it, as are their activities.
        Returns the identifier -> Dataset mapping.
        """
        dataset_map = {}

        for item in self.get_list():
            f = self.get_webform(item['url'])

            # get_webform returns None when the URL is rejected; skip it.
            if f is None:
                continue

            # aggregate datasets
            for image in f.images:
                for dataset in image.datasets:
                    if dataset.identifier not in dataset_map:
                        dataset_map[dataset.identifier] = dataset
                    else:
                        dataset_map[dataset.identifier].merge(dataset)
                        dataset_map[dataset.identifier].activity.merge(dataset.activity)

        return dataset_map