Class: S3QueryService

Inherits:
Object
  • Object
show all
Defined in:
app/services/s3_query_service.rb

Overview

A service to query an S3 bucket for information about a given data set.

Constant Summary collapse

PRECURATION =
"precuration"
POSTCURATION =
"postcuration"
PRESERVATION =
"preservation"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(model, mode = "precuration") ⇒ S3QueryService

Returns a new instance of S3QueryService.

Examples:

S3QueryService.new(Work.find(1), “precuration”)

Parameters:

  • model (Work)
  • mode (String) (defaults to: "precuration")

    Valid values are “precuration”, “postcuration”, “preservation”. This value controls the AWS S3 bucket used to access the files.



37
38
39
40
41
42
43
44
# File 'app/services/s3_query_service.rb', line 37

# Binds the service to one model and one storage mode.
#
# @param model [Work] the record being queried; must respond to #doi
# @param mode [String] "precuration", "postcuration" or "preservation".
#   The value is stored as-is; #config raises ArgumentError later if it is
#   not one of those three.
def initialize(model, mode = "precuration")
  @model = model
  @mode = mode
  @doi = @model.doi
  @s3_responses = {}
  @last_response = nil
  # 5 GiB (5_368_709_120 bytes): the maximum part size for AWS.
  @part_size = 5 * (1024**3)
end

Instance Attribute Details

#last_responseObject (readonly)

Returns the value of attribute last_response.



30
31
32
# File 'app/services/s3_query_service.rb', line 30

# Reader for the last response recorded by this service.
# It is nil after construction and is assigned by #upload_file after a
# single-request put_object.
#
# @return [Object, nil]
def last_response = @last_response

#modelObject (readonly)

Returns the value of attribute model.



8
9
10
# File 'app/services/s3_query_service.rb', line 8

# Reader for the model this service was constructed with.
#
# @return [Work]
def model = @model

#part_sizeObject (readonly)

Returns the value of attribute part_size.



30
31
32
# File 'app/services/s3_query_service.rb', line 30

# Reader for the size threshold (in bytes) above which copies and uploads
# switch to their multipart variants.
#
# @return [Integer]
def part_size = @part_size

Class Method Details

.configurationObject



14
15
16
# File 'app/services/s3_query_service.rb', line 14

# The S3 settings object exposed by the Rails application configuration.
#
# @return [Object] whatever `Rails.configuration.s3` holds
def self.configuration = Rails.configuration.s3

.object_attributesObject

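The list of object attributes requested from S3 by #get_s3_object_attributes. The request requires this list; accepted values are ETag, Checksum, ObjectParts, StorageClass, ObjectSize.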



114
115
116
117
118
119
120
121
122
# File 'app/services/s3_query_service.rb', line 114

# Attribute names sent as `object_attributes` in the
# get_object_attributes request.
#
# @return [Array<String>] a new array on every call
def self.object_attributes
  %w[ETag Checksum ObjectParts StorageClass ObjectSize]
end

.post_curation_configObject



22
23
24
# File 'app/services/s3_query_service.rb', line 22

# The `post_curation` section of the S3 configuration.
#
# @return [Object] the value of `configuration.post_curation`
def self.post_curation_config = configuration.post_curation

.pre_curation_configObject



18
19
20
# File 'app/services/s3_query_service.rb', line 18

# The `pre_curation` section of the S3 configuration.
#
# @return [Object] the value of `configuration.pre_curation`
def self.pre_curation_config = configuration.pre_curation

.preservation_configObject



26
27
28
# File 'app/services/s3_query_service.rb', line 26

# The `preservation` section of the S3 configuration.
#
# @return [Object] the value of `configuration.preservation`
def self.preservation_config = configuration.preservation

Instance Method Details

#access_key_idObject



97
98
99
# File 'app/services/s3_query_service.rb', line 97

# The "access_key_id" entry of the shared S3 configuration.
#
# @return [Object] the configured value (nil-handling depends on the
#   configuration object's #[])
def access_key_id = S3QueryService.configuration["access_key_id"]

#bucket_nameObject

The name of the bucket this class is configured to use. See config/s3.yml for configuration file.



69
70
71
# File 'app/services/s3_query_service.rb', line 69

# Name of the bucket for the current mode, read from the :bucket key of
# #config. See config/s3.yml for the configuration file.
#
# @return [Object, nil] nil when the key is absent
def bucket_name = config.fetch(:bucket, nil)

#build_s3_object_key(filename:) ⇒ Object



148
149
150
# File 'app/services/s3_query_service.rb', line 148

# Builds the full S3 key for a file by prepending #prefix to the filename.
#
# @param filename [#to_s] file name relative to the prefix
# @return [String]
def build_s3_object_key(filename:) = "#{prefix}#{filename}"

#check_file(bucket:, key:) ⇒ Object



311
312
313
314
315
316
317
# File 'app/services/s3_query_service.rb', line 311

# Issues a head_object request for the given bucket and key.
#
# @param bucket [String]
# @param key [String]
# @return [Object] the client's head_object response
# @raise [Aws::Errors::ServiceError] re-raised after being logged
def check_file(bucket:, key:)
  request = { bucket:, key: }
  client.head_object(request)
rescue Aws::Errors::ServiceError => e
  Rails.logger.error(
    "An error was encountered when requesting to check the status of the AWS S3 Object in the bucket #{bucket} with the key #{key}: #{e}"
  )
  raise e
end

#clientObject



109
110
111
# File 'app/services/s3_query_service.rb', line 109

# Memoized S3 client built from #region and #credentials.
#
# @return [Aws::S3::Client]
def client
  return @client if @client

  @client = Aws::S3::Client.new(region:, credentials:)
end

#client_s3_empty_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix) ⇒ Object



171
172
173
174
175
176
177
178
179
180
# File 'app/services/s3_query_service.rb', line 171

# Lists the S3 objects under the prefix that report themselves as empty.
#
# The result is memoized. NOTE(review): the memo is not keyed by
# bucket_name/prefix, so a later call with different arguments and
# reload: false returns the earlier listing.
#
# @param reload [Boolean] when true, drops the memo and the cached S3
#   responses for this bucket/prefix before listing again
# @param bucket_name [String] defaults to #bucket_name
# @param prefix [String] defaults to #prefix
# @return [Array] the objects from get_s3_objects for which #empty? is true
def client_s3_empty_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
  if reload
    @client_s3_empty_files = nil
    clear_s3_responses(bucket_name:, prefix:)
  end
  return @client_s3_empty_files if @client_s3_empty_files

  listing = get_s3_objects(bucket_name:, prefix:)
  @client_s3_empty_files = listing.select(&:empty?)
end

#client_s3_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix) ⇒ Array<S3File>

Retrieve the S3 resources uploaded to the S3 Bucket

Returns:



163
164
165
166
167
168
169
# File 'app/services/s3_query_service.rb', line 163

# Retrieves the S3 resources found under the prefix in the bucket.
#
# The result is memoized. NOTE(review): the memo is not keyed by
# bucket_name/prefix, so a later call with different arguments and
# reload: false returns the earlier listing.
#
# @param reload [Boolean] when true, drops the memo and the cached S3
#   responses for this bucket/prefix before listing again
# @param bucket_name [String] defaults to #bucket_name
# @param prefix [String] defaults to #prefix
# @return [Array<S3File>]
def client_s3_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
  if reload
    @client_s3_files = nil
    clear_s3_responses(bucket_name:, prefix:)
  end
  return @client_s3_files if @client_s3_files

  @client_s3_files = get_s3_objects(bucket_name:, prefix:)
end

#configObject



46
47
48
49
50
51
52
53
54
55
56
# File 'app/services/s3_query_service.rb', line 46

# Selects the configuration section that matches the mode given at
# construction.
#
# @return [Object] the preservation, post-curation or pre-curation section
# @raise [ArgumentError] when the mode is not one of the three known values
def config
  case @mode
  when PRESERVATION then self.class.preservation_config
  when POSTCURATION then self.class.post_curation_config
  when PRECURATION then self.class.pre_curation_config
  else
    raise ArgumentError, "Invalid mode value: #{@mode}"
  end
end

#copy_directory(source_key:, target_bucket:, target_key:) ⇒ Object



267
268
269
270
271
272
273
# File 'app/services/s3_query_service.rb', line 267

# Copies a directory object to the target bucket with a single copy_object
# request.
#
# @param source_key [String] passed to S3 unchanged as the copy source
# @param target_bucket [String]
# @param target_key [String]
# @return [Object] the client's copy_object response
# @raise [Aws::Errors::ServiceError] re-raised after being logged
def copy_directory(source_key:, target_bucket:, target_key:)
  client.copy_object(copy_source: source_key, bucket: target_bucket, key: target_key)
rescue Aws::Errors::ServiceError => e
  Rails.logger.error(
    "An error was encountered when requesting to copy the AWS S3 directory Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{e}"
  )
  raise e
end

#copy_file(source_key:, target_bucket:, target_key:, size:) ⇒ Object



234
235
236
237
238
239
240
241
242
243
244
245
# File 'app/services/s3_query_service.rb', line 234

# Copies one S3 object to the target bucket.
#
# Objects larger than #part_size are delegated to #copy_multi_part; all
# others use one copy_object request with a SHA256 checksum.
#
# @param source_key [String] the copy source
# @param target_bucket [String]
# @param target_key [String]
# @param size [Integer] object size in bytes
# @return [Object] the response of whichever copy path was taken
# @raise [Aws::Errors::ServiceError] re-raised after being logged
def copy_file(source_key:, target_bucket:, target_key:, size:)
  Rails.logger.info("Copying #{source_key} to #{target_bucket}/#{target_key}")
  return copy_multi_part(source_key:, target_bucket:, target_key:, size:) if size > part_size

  # "+" is percent-encoded before being sent as the copy source.
  escaped_source = source_key.gsub("+", "%2B")
  client.copy_object(copy_source: escaped_source, bucket: target_bucket, key: target_key, checksum_algorithm: "SHA256")
rescue Aws::Errors::ServiceError => e
  Rails.logger.error(
    "An error was encountered when requesting to copy AWS S3 Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{e}"
  )
  raise e
end

#copy_multi_part(source_key:, target_bucket:, target_key:, size:) ⇒ Object



247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
# File 'app/services/s3_query_service.rb', line 247

# Copies one S3 object to the target bucket as a multipart upload, one
# upload_part_copy request per #part_size-sized byte range.
#
# The copy source has "+" percent-encoded, matching what #copy_file does for
# single-request copies. If any request fails with a service error, the
# multipart upload is aborted (best effort) so already-copied parts are not
# left behind in the target bucket, and the original error is re-raised.
#
# @param source_key [String] the copy source
# @param target_bucket [String]
# @param target_key [String]
# @param size [Integer] total object size in bytes
# @return [Object] the client's complete_multipart_upload response
# @raise [Aws::Errors::ServiceError] re-raised after being logged
def copy_multi_part(source_key:, target_bucket:, target_key:, size:)
  multi = client.create_multipart_upload(bucket: target_bucket, key: target_key, checksum_algorithm: "SHA256")
  copy_source = source_key.gsub("+", "%2B")
  parts = (0...size).step(part_size).each_with_index.map do |start_byte, index|
    part_number = index + 1
    # Byte ranges are inclusive; the last part is clipped to the object size.
    end_byte = [start_byte + part_size, size].min - 1
    resp = client.upload_part_copy(bucket: target_bucket, copy_source:, key: multi.key, part_number:,
                                   upload_id: multi.upload_id, copy_source_range: "bytes=#{start_byte}-#{end_byte}")
    { etag: resp.copy_part_result.etag, part_number:, checksum_sha256: resp.copy_part_result.checksum_sha256 }
  end
  client.complete_multipart_upload(bucket: target_bucket, key: target_key, upload_id: multi.upload_id, multipart_upload: { parts: })
rescue Aws::Errors::ServiceError => aws_service_error
  message = "An error was encountered when requesting to multipart copy AWS S3 Object from #{source_key} to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}"
  Rails.logger.error(message)
  # `multi` is nil when create_multipart_upload itself failed.
  if multi
    begin
      client.abort_multipart_upload(bucket: target_bucket, key: target_key, upload_id: multi.upload_id)
    rescue StandardError => abort_error
      # Never let a failed abort mask the error that caused it.
      Rails.logger.error("Unable to abort the multipart upload #{multi.upload_id} for #{target_key} in the bucket #{target_bucket}: #{abort_error}")
    end
  end
  raise aws_service_error
end

#count_objects(bucket_name: self.bucket_name, prefix: self.prefix) ⇒ Object



326
327
328
329
# File 'app/services/s3_query_service.rb', line 326

# Totals the key_count reported by each cached S3 listing response for the
# bucket and prefix.
#
# @param bucket_name [String] defaults to #bucket_name
# @param prefix [String] defaults to #prefix
# @return [Integer] 0 when there are no responses
def count_objects(bucket_name: self.bucket_name, prefix: self.prefix)
  s3_responses(bucket_name:, prefix:).sum(&:key_count)
end

#create_directoryObject



284
285
286
287
288
289
290
# File 'app/services/s3_query_service.rb', line 284

# Creates the "directory" for this data set by putting a zero-length object
# whose key is #prefix into #bucket_name.
#
# @return [Object] the client's put_object response
# @raise [Aws::Errors::ServiceError] re-raised after being logged
def create_directory
  request = { bucket: bucket_name, key: prefix, content_length: 0 }
  client.put_object(request)
rescue Aws::Errors::ServiceError => e
  Rails.logger.error(
    "An error was encountered when requesting to create the AWS S3 directory Object in the bucket #{bucket_name} with the key #{prefix}: #{e}"
  )
  raise e
end

#credentialsObject



105
106
107
# File 'app/services/s3_query_service.rb', line 105

# Memoized AWS credentials built from #access_key_id and #secret_access_key.
#
# @return [Aws::Credentials]
def credentials
  return @credentials if @credentials

  @credentials = Aws::Credentials.new(access_key_id, secret_access_key)
end

#data_profileObject

Returns:

  • Hash with two properties: { objects: [<S3File>], ok: Bool }. objects is an Array of S3File objects. ok is false if there is an error connecting to S3; otherwise it is true.



198
199
200
201
202
203
204
# File 'app/services/s3_query_service.rb', line 198

# Summarises the files stored for this data set.
#
# @return [Hash] `{ objects: <result of #client_s3_files>, ok: true }` on
#   success; `{ objects: [], ok: false }` when any StandardError is raised
#   while listing (the error is logged, not re-raised)
def data_profile
  files = client_s3_files
  { objects: files, ok: true }
rescue => e
  Rails.logger.error("Error querying S3. Bucket: #{bucket_name}. DOI: #{@doi}. Exception: #{e.message}")

  { objects: [], ok: false }
end

#delete_s3_object(s3_file_key, bucket: bucket_name) ⇒ Object



275
276
277
278
279
280
281
282
# File 'app/services/s3_query_service.rb', line 275

# Deletes one object from S3.
#
# @param s3_file_key [String] key of the object to delete
# @param bucket [String] bucket to delete from; defaults to #bucket_name
# @return [Hash] the delete_object response converted with #to_h
# @raise [Aws::Errors::ServiceError] re-raised after being logged
def delete_s3_object(s3_file_key, bucket: bucket_name)
  resp = client.delete_object({ bucket:, key: s3_file_key })
  resp.to_h
rescue Aws::Errors::ServiceError => aws_service_error
  # Report the bucket the request was actually sent to, which may differ from
  # the service's default bucket when the caller overrides `bucket:`.
  message = "An error was encountered when requesting to delete the AWS S3 Object #{s3_file_key} in the bucket #{bucket}: #{aws_service_error}"
  Rails.logger.error(message)
  raise aws_service_error
end

#file_countObject



182
183
184
185
186
187
188
# File 'app/services/s3_query_service.rb', line 182

# Number of entries returned by #client_s3_files.
#
# @return [Integer]
# @raise [Aws::Errors::ServiceError] re-raised after being logged
def file_count
  files = client_s3_files
  files.count
rescue Aws::Errors::ServiceError => e
  Rails.logger.error(
    "An error was encountered when requesting AWS S3 Objects from the bucket #{bucket_name} with the prefix #{prefix}: #{e}"
  )
  raise e
end

#file_url(key) ⇒ Object

Public signed URL to fetch this file from the S3 (valid for a limited time)



92
93
94
95
# File 'app/services/s3_query_service.rb', line 92

# Builds a presigned get_object URL for a key in #bucket_name.
#
# @param key [String] full S3 key of the object
# @return [String] the URL produced by Aws::S3::Presigner#presigned_url
def file_url(key)
  Aws::S3::Presigner.new(client:).presigned_url(:get_object, bucket: bucket_name, key:)
end

#find_s3_file(filename:) ⇒ Object



152
153
154
155
156
157
158
159
# File 'app/services/s3_query_service.rb', line 152

# Looks up a single file under this data set's prefix and wraps its S3
# attributes in an S3File.
#
# NOTE(review): #get_s3_object_attributes returns `response.to_h`, so the
# nil guard below may never trigger — confirm.
#
# @param filename [String] file name relative to the prefix
# @return [S3File, nil] nil when no attributes were returned
def find_s3_file(filename:)
  key = build_s3_object_key(filename:)
  attributes = get_s3_object_attributes(key:)
  return if attributes.nil?

  S3File.new(
    work: model,
    filename: key,
    last_modified: attributes[:last_modified],
    size: attributes[:object_size],
    checksum: attributes[:etag]
  )
end

#get_s3_object(key:) ⇒ Object



133
134
135
136
137
138
139
140
141
142
143
144
145
146
# File 'app/services/s3_query_service.rb', line 133

# Fetches an object from #bucket_name and returns the response as a Hash.
#
# @param key [String] full S3 key of the object
# @return [Hash, nil] nil when the response converts to an empty Hash
# @raise [Aws::Errors::ServiceError] re-raised after being logged
def get_s3_object(key:)
  payload = client.get_object({ bucket: bucket_name, key: }).to_h
  payload unless payload.empty?
rescue Aws::Errors::ServiceError => e
  Rails.logger.error("An error was encountered when requesting the AWS S3 Object #{key}: #{e}")
  raise e
end

#get_s3_object_attributes(key:) ⇒ Object



124
125
126
127
128
129
130
131
# File 'app/services/s3_query_service.rb', line 124

# Requests the attributes listed by `self.class.object_attributes` for one
# object in #bucket_name. Errors from the client are not rescued here.
#
# @param key [String] full S3 key of the object
# @return [Hash] the get_object_attributes response converted with #to_h
def get_s3_object_attributes(key:)
  request = { bucket: bucket_name, key:, object_attributes: self.class.object_attributes }
  client.get_object_attributes(request).to_h
end

#md5(io:) ⇒ Object



319
320
321
322
323
324
# File 'app/services/s3_query_service.rb', line 319

# Computes the base64-encoded MD5 digest of an IO's remaining content and
# rewinds the IO afterwards.
#
# @param io [#each, #rewind] read via `each(10_000)`, i.e. in pieces of at
#   most roughly 10,000 bytes
# @return [String] base64 MD5 digest
def md5(io:)
  digest = Digest::MD5.new
  io.each(10_000) { |chunk| digest << chunk }
  io.rewind
  digest.base64digest
end

#post_curation?Boolean

Returns:

  • (Boolean)


62
63
64
# File 'app/services/s3_query_service.rb', line 62

# Whether this service was constructed in post-curation mode.
#
# @return [Boolean]
def post_curation? = (@mode == POSTCURATION)

#pre_curation?Boolean

Returns:

  • (Boolean)


58
59
60
# File 'app/services/s3_query_service.rb', line 58

# Whether this service was constructed in pre-curation mode.
#
# @return [Boolean]
def pre_curation? = (@mode == PRECURATION)

#prefixObject

The S3 prefix for this object, i.e., the address within the S3 bucket, which is built from the DOI and the model's id ("<doi>/<id>/")



80
81
82
# File 'app/services/s3_query_service.rb', line 80

# The key prefix under which this data set's objects live: the DOI, then the
# model's id, each followed by a slash.
#
# @return [String] e.g. "<doi>/<id>/"
def prefix
  work_id = model.id
  "#{@doi}/#{work_id}/"
end

#publish_files(current_user) ⇒ Object

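Enqueues one background job (ApprovedFileMoveJob) per file to copy the existing files from the pre-curation bucket to the post-curation bucket, after recording the file list in an ApprovedUploadSnapshot. Notice that the copy process happens at AWS (i.e. the files are not downloaded and re-uploaded). Each empty file found is noted in the work's activity log. Returns true once the jobs have been enqueued.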



210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# File 'app/services/s3_query_service.rb', line 210

# Starts publication of this work's files from the pre-curation bucket to the
# post-curation bucket.
#
# Steps, in order: log a warning activity for every empty file found, store
# the full file listing in an ApprovedUploadSnapshot, then enqueue one
# ApprovedFileMoveJob per listed file.
#
# @param current_user [#id] the user recorded on the activities and snapshot
# @return [true]
def publish_files(current_user)
  source_bucket = S3QueryService.pre_curation_config[:bucket]
  target_bucket = S3QueryService.post_curation_config[:bucket]

  # Do not move the empty files, however, ensure that it is noted that the
  #   presence of empty files is specified in the provenance log.
  client_s3_empty_files(reload: true, bucket_name: source_bucket).each do |empty_file|
    warning = "Warning: Attempted to publish empty S3 file #{empty_file.filename}."
    WorkActivity.add_work_activity(model.id, warning, current_user.id, activity_type: WorkActivity::SYSTEM)
  end

  files = client_s3_files(reload: true, bucket_name: source_bucket)
  snapshot = ApprovedUploadSnapshot.new(work: model)
  snapshot.store_files(files, current_user:)
  snapshot.save

  files.each do |file|
    ApprovedFileMoveJob.perform_later(
      work_id: model.id,
      source_bucket:,
      source_key: file.key,
      target_bucket:,
      target_key: file.key,
      size: file.size,
      snapshot_id: snapshot.id
    )
  end
  true
end

#regionObject



73
74
75
# File 'app/services/s3_query_service.rb', line 73

# AWS region for the current mode, read from the :region key of #config.
#
# @return [Object, nil] nil when the key is absent
def region = config.fetch(:region, nil)

#s3_addressObject

Construct an S3 address for this data set



86
87
88
# File 'app/services/s3_query_service.rb', line 86

# The s3:// address of this data set: #bucket_name followed by #prefix.
#
# @return [String] e.g. "s3://<bucket>/<doi>/<id>/"
def s3_address
  bucket = bucket_name
  location = prefix
  "s3://#{bucket}/#{location}"
end

#secret_access_keyObject



101
102
103
# File 'app/services/s3_query_service.rb', line 101

# The "secret_access_key" entry of the shared S3 configuration.
#
# @return [Object] the configured value (nil-handling depends on the
#   configuration object's #[])
def secret_access_key = S3QueryService.configuration["secret_access_key"]

#upload_file(io:, filename:, size:, md5_digest: nil) ⇒ Object



292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# File 'app/services/s3_query_service.rb', line 292

# Uploads an IO to `<prefix><filename>` in #bucket_name.
#
# Content no larger than #part_size goes up in one put_object request with a
# Content-MD5 header, and the response is kept in @last_response. Anything
# larger is handed to upload_multipart_file.
#
# @param io [IO] content to upload
# @param filename [String] file name relative to the prefix
# @param size [Integer] content size in bytes
# @param md5_digest [String, nil] precomputed base64 MD5; computed with #md5
#   when nil (single-request path only)
# @return [String, false] the object key, or false when S3 reports a
#   signature mismatch (Honeybadger is notified in that case)
# @raise [Aws::Errors::ServiceError] any other service error, re-raised after
#   being logged
def upload_file(io:, filename:, size:, md5_digest: nil)
  key = "#{prefix}#{filename}"
  if size <= part_size
    # A single request may not exceed 5GB, hence the size check.
    md5_digest ||= md5(io:)
    @last_response = client.put_object(bucket: bucket_name, key:, body: io, content_md5: md5_digest)
  else
    upload_multipart_file(target_bucket: bucket_name, target_key: key, size:, io:)
  end
  key
rescue Aws::S3::Errors::SignatureDoesNotMatch => signature_error
  Honeybadger.notify("Error Uploading file #{filename} for object: #{s3_address} Signature did not match! error: #{signature_error}")
  false
rescue Aws::Errors::ServiceError => service_error
  Rails.logger.error(
    "An error was encountered when requesting to create the AWS S3 Object in the bucket #{bucket_name} with the key #{key}: #{service_error}"
  )
  raise service_error
end