Skip to content

Commit 14757b7

Browse files
feat: store content digest metadata for uploaded objects. (stjude-rust-labs#27)
* feat: store content digest metadata for uploaded objects. This commit adds calculation of SHA-256 content digests for uploaded objects, using a metadata header to store a `Content-Digest` header value. It also adds `get_content_digest` to retrieve the content digest of a given URL, which issues a `HEAD` request and returns either a `Content-Digest` header value (directly or via the metadata header) or a strong `ETag` header. Adds the `--hash-algorithm` option to `cloud-copy` for specifying either `none`, `sha256`, or `blake3` for calculating content digests. * chore: update CHANGELOG. * chore: code review feedback. * Update src/lib.rs Co-authored-by: Clay McLeod <[email protected]> --------- Co-authored-by: Clay McLeod <[email protected]>
1 parent 5ec28cd commit 14757b7

File tree

13 files changed

+448
-19
lines changed

13 files changed

+448
-19
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99

1010
#### Added
1111

12+
* Added `get_content_digest` function for retrieving content digests ([#27](https://github.com/stjude-rust-labs/cloud-copy/pull/27)).
13+
* Added content digest metadata for uploads, defaulting to SHA-256 ([#27](https://github.com/stjude-rust-labs/cloud-copy/pull/27)).
14+
* Added `--hash-algorithm` to the `cloud-copy` CLI for specifying the algorithm to
15+
use for attaching content digest metadata to uploaded objects ([#27](https://github.com/stjude-rust-labs/cloud-copy/pull/27)).
1216
* Added support for Azure Shared Key authentication ([#26](https://github.com/stjude-rust-labs/cloud-copy/pull/26)).
1317

1418
#### Changed

Cargo.lock

Lines changed: 41 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ cli = [
3131
[dependencies]
3232
anyhow = { version = "1.0.99", optional = true }
3333
base64 = "0.22.1"
34+
blake3 = { version = "1.8.2", features = ["mmap", "rayon"] }
3435
byte-unit = { version = "5.1.6", optional = true }
3536
bytes = "1.10.1"
3637
chrono = { version = "0.4.41", features = ["serde"] }

src/backend.rs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,5 +110,14 @@ pub trait StorageBackend {
110110
fn walk(&self, url: Url) -> impl Future<Output = Result<Vec<String>>> + Send;
111111

112112
/// Creates a new upload.
113-
fn new_upload(&self, url: Url) -> impl Future<Output = Result<Self::Upload>> + Send;
113+
///
114+
/// If `digest` is `Some`, it is expected to be a `Content-Digest` header
115+
/// value.
116+
///
117+
/// See: <https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/Content-Digest>
118+
fn new_upload(
119+
&self,
120+
url: Url,
121+
digest: Option<String>,
122+
) -> impl Future<Output = Result<Self::Upload>> + Send;
114123
}

src/backend/azure.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,14 @@ const AZURE_BLOB_TYPE: &str = "BlockBlob";
7777
/// The name of the root container.
7878
const AZURE_ROOT_CONTAINER: &str = "$root";
7979

80+
/// The Azure content digest header name.
81+
///
82+
/// This is the content digest for the entire blob.
83+
///
84+
/// Note: the metadata name *must* be a legal C# identifier (i.e. cannot contain
85+
/// `-`).
86+
pub(crate) const AZURE_CONTENT_DIGEST_HEADER: &str = "x-ms-meta-content_digest";
87+
8088
/// Inserts the authentication header into the request.
8189
fn insert_authentication_header(auth: &AzureAuthConfig, request: &mut Request) -> Result<()> {
8290
let signer = RequestSigner::new(auth);
@@ -251,6 +259,8 @@ pub struct AzureBlobUpload {
251259
url: Url,
252260
/// The Azure block id.
253261
block_id: Arc<String>,
262+
/// The content digest header value of the blob.
263+
digest: Option<String>,
254264
/// The channel for sending progress updates.
255265
events: Option<broadcast::Sender<TransferEvent>>,
256266
}
@@ -262,13 +272,15 @@ impl AzureBlobUpload {
262272
client: HttpClient,
263273
url: Url,
264274
block_id: Arc<String>,
275+
digest: Option<String>,
265276
events: Option<broadcast::Sender<TransferEvent>>,
266277
) -> Self {
267278
Self {
268279
config,
269280
client,
270281
url,
271282
block_id,
283+
digest,
272284
events,
273285
}
274286
}
@@ -367,6 +379,15 @@ impl Upload for AzureBlobUpload {
367379
.body(body)
368380
.build()?;
369381

382+
if let Some(digest) = &self.digest {
383+
request.headers_mut().insert(
384+
AZURE_CONTENT_DIGEST_HEADER,
385+
digest
386+
.try_into()
387+
.expect("invalid content digest header value"),
388+
);
389+
}
390+
370391
if let Some(auth) = self.config.azure().auth() {
371392
insert_authentication_header(auth, &mut request)?;
372393
}
@@ -778,7 +799,7 @@ impl StorageBackend for AzureBlobStorageBackend {
778799
Ok(paths)
779800
}
780801

781-
async fn new_upload(&self, url: Url) -> Result<Self::Upload> {
802+
async fn new_upload(&self, url: Url, digest: Option<String>) -> Result<Self::Upload> {
782803
debug_assert!(
783804
Self::is_supported_url(&self.config, &url),
784805
"{url} is not a supported Azure URL",
@@ -800,6 +821,7 @@ impl StorageBackend for AzureBlobStorageBackend {
800821
self.client.clone(),
801822
url,
802823
Arc::new(Alphanumeric::new(16).to_string()),
824+
digest,
803825
self.events.clone(),
804826
))
805827
}

src/backend/generic.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ impl StorageBackend for GenericStorageBackend {
214214
Ok(Vec::default())
215215
}
216216

217-
async fn new_upload(&self, _: Url) -> Result<Self::Upload> {
217+
async fn new_upload(&self, _: Url, _: Option<String>) -> Result<Self::Upload> {
218218
panic!("generic storage backend cannot be used for uploading");
219219
}
220220
}

src/backend/google.rs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,15 @@ const MAX_FILE_SIZE: u64 = MAX_PART_SIZE * 1024;
6060
const GOOGLE_DATE_HEADER: &str = "x-goog-date";
6161

6262
/// The Google content SHA256 header name.
63+
///
64+
/// This is the SHA256 of each upload part for multipart uploads.
6365
const GOOGLE_CONTENT_SHA256_HEADER: &str = "x-goog-content-sha256";
6466

67+
/// The Google content digest header name.
68+
///
69+
/// This is the content digest for the entire object.
70+
pub(crate) const GOOGLE_CONTENT_DIGEST_HEADER: &str = "x-goog-meta-content-digest";
71+
6572
/// Represents a Google-specific copy operation error.
6673
#[derive(Debug, thiserror::Error)]
6774
pub enum GoogleError {
@@ -760,7 +767,7 @@ impl StorageBackend for GoogleStorageBackend {
760767
Ok(paths)
761768
}
762769

763-
async fn new_upload(&self, url: Url) -> Result<Self::Upload> {
770+
async fn new_upload(&self, url: Url, digest: Option<String>) -> Result<Self::Upload> {
764771
// See: https://cloud.google.com/storage/docs/xml-api/post-object-multipart
765772

766773
debug_assert!(
@@ -796,6 +803,15 @@ impl StorageBackend for GoogleStorageBackend {
796803
.header(GOOGLE_CONTENT_SHA256_HEADER, sha256_hex_string([]))
797804
.build()?;
798805

806+
if let Some(digest) = digest {
807+
request.headers_mut().insert(
808+
GOOGLE_CONTENT_DIGEST_HEADER,
809+
digest
810+
.try_into()
811+
.expect("invalid content digest header value"),
812+
);
813+
}
814+
799815
if let Some(auth) = self.config.google().auth() {
800816
insert_authentication_header(auth, date, &mut request)?;
801817
}

src/backend/s3.rs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,15 @@ const MAX_FILE_SIZE: u64 = MAX_PART_SIZE * 1024;
6161
const AWS_DATE_HEADER: &str = "x-amz-date";
6262

6363
/// The AWS content SHA256 header name.
64+
///
65+
/// This is the SHA256 of each upload part for multipart uploads.
6466
const AWS_CONTENT_SHA256_HEADER: &str = "x-amz-content-sha256";
6567

68+
/// The AWS content digest header name.
69+
///
70+
/// This is the content digest for the entire object.
71+
pub(crate) const AWS_CONTENT_DIGEST_HEADER: &str = "x-amz-meta-content-digest";
72+
6673
/// Represents an S3-specific copy operation error.
6774
#[derive(Debug, thiserror::Error)]
6875
pub enum S3Error {
@@ -841,7 +848,7 @@ impl StorageBackend for S3StorageBackend {
841848
Ok(paths)
842849
}
843850

844-
async fn new_upload(&self, url: Url) -> Result<Self::Upload> {
851+
async fn new_upload(&self, url: Url, digest: Option<String>) -> Result<Self::Upload> {
845852
// See: https://docs.aws.amazon.com/AmazonS3/latest/API/API_CreateMultipartUpload.html
846853

847854
debug_assert!(
@@ -874,6 +881,15 @@ impl StorageBackend for S3StorageBackend {
874881
.header(AWS_CONTENT_SHA256_HEADER, sha256_hex_string([]))
875882
.build()?;
876883

884+
if let Some(digest) = digest {
885+
request.headers_mut().insert(
886+
AWS_CONTENT_DIGEST_HEADER,
887+
digest
888+
.try_into()
889+
.expect("invalid content digest header value"),
890+
);
891+
}
892+
877893
if let Some(auth) = self.config.s3().auth() {
878894
insert_authentication_header(auth, date, &mut request)?;
879895
}

0 commit comments

Comments
 (0)