webform_client.py 7.51 KB
Newer Older
1
2
3
#!/usr/bin/python

import urllib
abuddenberg's avatar
abuddenberg committed
4
import re
5
from os.path import join
6
import getpass
7
import requests
8
from dateutil.parser import parse
abuddenberg's avatar
abuddenberg committed
9

abuddenberg's avatar
abuddenberg committed
10
from domain import Figure, Image, Dataset, Activity, Contributor, Person, Organization
abuddenberg's avatar
abuddenberg committed
11
12
13
14
15


def sanitized(pattern):
    """Build a decorator that only runs a function for whitelisted arguments.

    The second positional argument of the decorated callable (``args[1]``;
    for a method that is the first argument after ``self``) is URL-quoted and
    matched against ``pattern`` with ``re.match``.

    :param pattern: regular expression the quoted argument must match
    :return: a decorator. The decorated callable returns whatever the wrapped
        function returns when the argument matches; otherwise it prints a
        rejection message and returns None without calling the function.
    :raises IndexError: (at call time) if the checked argument is not passed
        positionally
    """
    import functools

    #urllib.quote only exists on Python 2; Python 3 moved it to urllib.parse
    try:
        quote = urllib.quote
    except AttributeError:
        from urllib.parse import quote

    #Compile once at decoration time instead of on every call
    matcher = re.compile(pattern)

    def dec(fn):
        #wraps() keeps the wrapped function's __name__ and docstring
        @functools.wraps(fn)
        def wrapped(*args, **kwargs):
            candidate = args[1]
            #Quoting turns whitespace/control characters into %XX escapes, so
            #they cannot slip past an anchored pattern
            if matcher.match(quote(candidate)):
                return fn(*args, **kwargs)

            #Single-argument print() behaves the same as a statement and as a function
            print('Rejected:  %s' % (candidate,))
            return None
        return wrapped
    return dec


abuddenberg's avatar
abuddenberg committed
24
25
26
27
28
29
30
31
def parse_creators(field):
    """Turn a free-text creator field into a Contributor.

    The field is split on commas. The first chunk is taken as the person's
    name: its first whitespace-separated token becomes the first name and its
    last token the last name (a single-token name is used for both). The
    second chunk, if any, is taken as the organization name; further chunks
    are ignored.

    :param field: creator text, e.g. 'Jane Q. Doe, Some Organization'; None
        is treated like an empty string
    :return: a Contributor whose ``person`` and ``organization`` attributes
        are set. Name parts that cannot be determined are None.
    """
    #Strip each chunk so 'Jane Doe, Org' does not yield ' Org'
    chunks = [chunk.strip() for chunk in (field or '').split(',')]

    name_tokens = chunks[0].split()
    #A blank name used to raise IndexError; fall back to None instead
    first_name = name_tokens[0] if name_tokens else None
    last_name = name_tokens[-1] if name_tokens else None

    org_name = chunks[1] if len(chunks) > 1 and chunks[1] else None

    contributor = Contributor({})
    contributor.person = Person({'first_name': first_name, 'last_name': last_name})
    contributor.organization = Organization({'name': org_name})

    return contributor
abuddenberg's avatar
abuddenberg committed
37

38

39
40
41
42
43
44
45
46
47
48
49
def get_credentials():
    """Return the webform token, prompting for it when none is configured.

    ``gcis_clients.webform_token`` is used when it is not None (presumably
    populated from the WEBFORM_TOKEN environment variable -- confirm in
    gcis_clients). Otherwise the token is read interactively with getpass.

    :return: the webform token
    """
    #Imported here rather than at module level, as in the original code
    from gcis_clients import webform_token

    if webform_token is None:
        return getpass.getpass('Webform token: ')

    return webform_token


50
class WebformClient(object):
    """HTTP client for a webform metadata service.

    Every request appends the access token as a ``token`` query parameter.
    Webforms are converted into the domain objects Figure, Image, Dataset and
    Activity.
    """

    #Webform field that holds an image's file name
    _FILENAME_KEY = 'what_is_the_file_name_extension_of_the_image'

    def __init__(self, url, token, local_image_dir=None, remote_dir='/system/files/'):
        """Store connection settings.

        :param url: base URL of the service; request paths are appended to it
        :param token: access token; when None it is obtained via get_credentials()
        :param local_image_dir: directory for downloaded images; when falsy,
            gcis_clients.default_image_dir() is used
        :param remote_dir: server-side path prefix under which image files live
        """
        self.base_url = url

        #If token was not provided, obtain it
        if token is None:
            token = get_credentials()
        self.token = token

        if local_image_dir:
            self.images_dir = local_image_dir
        else:
            from gcis_clients import default_image_dir
            self.images_dir = default_image_dir()
        self.remote_image_dir = remote_dir

    def get_list(self):
        """Fetch ``/metadata/list`` and return the decoded JSON response.

        get_aggregated_datasets() reads a 'url' key from each returned item.
        """
        url = '{b}/metadata/list?token={t}'.format(b=self.base_url, t=self.token)
        return requests.get(url).json()

    def get_all_webforms(self):
        """Placeholder; not implemented (always returns None)."""
        pass

    @sanitized(r'^/metadata/figures/\d+$')
    def get_webform(self, fig_url, download_images=False):
        """Fetch one webform and convert it into a Figure.

        :param fig_url: path of the form ``/metadata/figures/<digits>``; it
            must be passed positionally because the sanitized decorator
            inspects ``args[1]``
        :param download_images: when true, and the webform has an 'images'
            entry, download every image of the figure afterwards
        :return: the populated Figure, or None when the decorator rejects fig_url
        """
        full_url = '{b}{url}?token={t}'.format(b=self.base_url, url=fig_url, t=self.token)
        webform_json = requests.get(full_url).json()

        #TODO: refactor the service so this isn't necessary
        #The payload is keyed by the webform's node id; take the first key
        webform_nid = next(iter(webform_json))
        webform = webform_json[webform_nid]

        figure_json = webform['figure'][0]
        f = Figure(figure_json)

        #Add contributor info
        if 'list_the_creator_of_the_figure' in figure_json:
            f.add_contributor(parse_creators(figure_json['list_the_creator_of_the_figure']))

        if 'images' in webform:
            for image in webform['images']:
                image_obj = Image(image, local_path=self.get_local_image_path(image),
                                  remote_path=self.get_remote_image_path(image))

                #Add contributor info
                if 'list_the_creator_of_the_image' in image:
                    image_obj.add_contributor(parse_creators(image['list_the_creator_of_the_image']))

                if 'datasources' in image:
                    for dataset_json in image['datasources']:
                        #TODO: Extract DOIs from citation
                        image_obj.datasets.append(self._build_dataset(dataset_json, image_obj, fig_url, f))

                f.images.append(image_obj)

            #If download_images arg is set, attempt to download all images for this figure
            if download_images:
                self.download_all_images(f)
        return f

    def _build_dataset(self, dataset_json, image_obj, fig_url, figure):
        """Build a Dataset, including its Activity, from one 'datasources' entry.

        :param dataset_json: dict describing the data source
        :param image_obj: the Image the dataset belongs to; the part of its
            identifier before the first '-' prefixes the activity identifier
        :param fig_url: webform path, used only in diagnostic output
        :param figure: the Figure being built; its title is used only in
            diagnostic output
        :return: the populated Dataset
        """
        dataset = Dataset(dataset_json)

        #Commence the hacks
        try:
            dataset.temporal_extent = ' '.join(
                [parse(dataset_json[field]).isoformat() for field in ('start_time', 'end_time')]
            )
        except (TypeError, ValueError) as e:
            #Unparseable or missing times: report and carry on without an extent
            print('Problem with start/end time:  %s %s %s' % (fig_url, figure.title, e))
            print('%s %s' % (dataset_json['start_time'], dataset_json['end_time']))
            dataset.temporal_extent = None

        dataset.spatial_extent = ' '.join(['{k}: {v};'.format(k=key, v=dataset_json[key]) for key in
                                           ('maximum_latitude', 'minimum_latitude', 'maximum_longitude',
                                            'minimum_longitude')])

        #Filter overlapping Dataset keys out
        dataset_only_keys = ('href', 'uri', 'identifier', 'start_time', 'end_time')
        activity_json = {k: dataset_json[k] for k in dataset_json if k not in dataset_only_keys}

        #Add synthetic identifier
        activity_json['identifier'] = '-'.join((image_obj.identifier.split('-')[0], dataset.identifier, 'process'))
        dataset.activity = Activity(activity_json)

        return dataset

    def _image_filename(self, image_json):
        """Return the lower-cased image file name, or None if there is none."""
        if not image_json:
            return None

        #.get() so a webform without the field yields None instead of KeyError
        filename = image_json.get(self._FILENAME_KEY)
        if filename in (None, ''):
            return None
        return filename.lower()

    def get_remote_image_path(self, image_json):
        """Return remote_image_dir + lower-cased file name, or None without a file name."""
        filename = self._image_filename(image_json)
        if filename is None:
            return None
        return self.remote_image_dir + filename

    def get_local_image_path(self, image_json):
        """Return the lower-cased file name joined onto images_dir, or None without a file name."""
        filename = self._image_filename(image_json)
        if filename is None:
            return None
        return join(self.images_dir, filename)

    def remote_image_exists(self, path):
        """Issue a HEAD request for ``path`` and return True iff the status is 200."""
        url = '{b}{path}?token={t}'.format(b=self.base_url, path=path, t=self.token)
        resp = requests.head(url)
        return resp.status_code == 200

    def download_image(self, image):
        """Download one image into images_dir.

        :param image: object with a ``remote_path`` attribute
        :return: local path of the written file
        :raises Exception: when the image has no remote path, the server
            answers 404, or the server answers any other non-200 status
        """
        #Fail before a URL ending in 'None' is requested
        if not image.remote_path:
            raise Exception('Image has no remote path')

        url = '{b}{path}?token={t}'.format(b=self.base_url, path=image.remote_path, t=self.token)
        resp = requests.get(url, stream=True)

        if resp.status_code == 200:
            filepath = join(self.images_dir, image.remote_path.split('/')[-1])
            with open(filepath, 'wb') as image_out:
                #Stream in chunks so the image is never held in memory whole
                for chunk in resp.iter_content(chunk_size=4096):
                    image_out.write(chunk)

            return filepath
        elif resp.status_code == 404:
            raise Exception('Image not found: {u}'.format(u=url))
        else:
            raise Exception(resp.status_code)

    def download_all_images(self, figure):
        """Download every image in ``figure.images``; the first failure propagates."""
        for image in figure.images:
            self.download_image(image)

    def get_aggregated_datasets(self):
        """Collect the datasets of every listed webform, merged by identifier.

        The first dataset seen for an identifier is kept; later ones with the
        same identifier are merged into it, as are their activities.

        :return: list of the merged Dataset objects
        """
        dataset_map = {}

        for item in self.get_list():
            webform_url = item['url']

            f = self.get_webform(webform_url)
            #get_webform returns None when the URL fails sanitization
            if f is None:
                continue

            #aggregate datasets
            for image in f.images:
                for dataset in image.datasets:
                    if dataset.identifier not in dataset_map:
                        dataset_map[dataset.identifier] = dataset
                    else:
                        dataset_map[dataset.identifier].merge(dataset)
                        dataset_map[dataset.identifier].activity.merge(dataset.activity)

        #list() keeps the return type a list on Python 3 as well
        return list(dataset_map.values())