diff options
Diffstat (limited to 'llvm_tools/patch_sync/src/patch_parsing.rs')
-rw-r--r-- | llvm_tools/patch_sync/src/patch_parsing.rs | 313 |
1 file changed, 313 insertions, 0 deletions
diff --git a/llvm_tools/patch_sync/src/patch_parsing.rs b/llvm_tools/patch_sync/src/patch_parsing.rs new file mode 100644 index 00000000..733451ae --- /dev/null +++ b/llvm_tools/patch_sync/src/patch_parsing.rs @@ -0,0 +1,313 @@ +use std::collections::{BTreeMap, BTreeSet}; +use std::fs::{copy, File}; +use std::io::{BufRead, BufReader, Read, Write}; +use std::path::{Path, PathBuf}; + +use anyhow::{anyhow, Context, Result}; +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; + +/// JSON serde struct. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PatchDictSchema { + pub rel_patch_path: String, + pub start_version: Option<u64>, + pub end_version: Option<u64>, + pub platforms: BTreeSet<String>, + pub metadata: Option<BTreeMap<String, serde_json::Value>>, +} + +/// Struct to keep track of patches and their relative paths. +#[derive(Debug, Clone)] +pub struct PatchCollection { + pub patches: Vec<PatchDictSchema>, + pub workdir: PathBuf, +} + +impl PatchCollection { + /// Create a `PatchCollection` from a PATCHES. + pub fn parse_from_file(json_file: &Path) -> Result<Self> { + Ok(Self { + patches: serde_json::from_reader(File::open(json_file)?)?, + workdir: json_file + .parent() + .ok_or_else(|| anyhow!("failed to get json_file parent"))? + .to_path_buf(), + }) + } + + /// Create a `PatchCollection` from a string literal and a workdir. + pub fn parse_from_str(workdir: PathBuf, contents: &str) -> Result<Self> { + Ok(Self { + patches: serde_json::from_str(contents).context("parsing from str")?, + workdir, + }) + } + + #[allow(dead_code)] + /// Return true if the collection is tracking any patches. + pub fn is_empty(&self) -> bool { + self.patches.is_empty() + } + + /// Compute the set-set subtraction, returning a new `PatchCollection` which + /// keeps the minuend's wordir. 
+ pub fn subtract(&self, subtrahend: &Self) -> Result<Self> { + let mut new_patches = Vec::new(); + // This is O(n^2) when it could be much faster, but n is always going to be less + // than 1k and speed is not important here. + for our_patch in &self.patches { + let found_in_sub = subtrahend.patches.iter().any(|sub_patch| { + let hash1 = subtrahend + .hash_from_rel_patch(sub_patch) + .expect("getting hash from subtrahend patch"); + let hash2 = self + .hash_from_rel_patch(our_patch) + .expect("getting hash from our patch"); + hash1 == hash2 + }); + if !found_in_sub { + new_patches.push(our_patch.clone()); + } + } + Ok(Self { + patches: new_patches, + workdir: self.workdir.clone(), + }) + } + + pub fn union(&self, other: &Self) -> Result<Self> { + self.union_helper( + other, + |p| self.hash_from_rel_patch(p), + |p| other.hash_from_rel_patch(p), + ) + } + + fn union_helper( + &self, + other: &Self, + our_hash_f: impl Fn(&PatchDictSchema) -> Result<String>, + their_hash_f: impl Fn(&PatchDictSchema) -> Result<String>, + ) -> Result<Self> { + // 1. For all our patches: + // a. If there exists a matching patch hash from `other`: + // i. Create a new patch with merged platform info, + // ii. add the new patch to our new collection. + // iii. Mark the other patch as "merged" + // b. Otherwise, copy our patch to the new collection + // 2. For all unmerged patches from the `other` + // a. Copy their patch into the new collection + let mut combined_patches = Vec::new(); + let mut other_merged = vec![false; other.patches.len()]; + + // 1. + for p in &self.patches { + let our_hash = our_hash_f(p)?; + let mut found = false; + // a. + for (idx, merged) in other_merged.iter_mut().enumerate() { + if !*merged { + let other_p = &other.patches[idx]; + let their_hash = their_hash_f(other_p)?; + if our_hash == their_hash { + // i. + let new_platforms = + p.platforms.union(&other_p.platforms).cloned().collect(); + // ii. 
+ combined_patches.push(PatchDictSchema { + rel_patch_path: p.rel_patch_path.clone(), + start_version: p.start_version, + end_version: p.end_version, + platforms: new_platforms, + metadata: p.metadata.clone(), + }); + // iii. + *merged = true; + found = true; + break; + } + } + } + // b. + if !found { + combined_patches.push(p.clone()); + } + } + // 2. + // Add any remaining, other-only patches. + for (idx, merged) in other_merged.iter().enumerate() { + if !*merged { + combined_patches.push(other.patches[idx].clone()); + } + } + + Ok(Self { + workdir: self.workdir.clone(), + patches: combined_patches, + }) + } + + /// Copy all patches from this collection into another existing collection, and write that + /// to the existing collection's file. + pub fn transpose_write(&self, existing_collection: &mut Self) -> Result<()> { + for p in &self.patches { + let original_file_path = self.workdir.join(&p.rel_patch_path); + let copy_file_path = existing_collection.workdir.join(&p.rel_patch_path); + copy_create_parents(&original_file_path, ©_file_path)?; + existing_collection.patches.push(p.clone()); + } + existing_collection.write_patches_json("PATCHES.json") + } + + /// Write out the patch collection contents to a PATCHES.json file. + fn write_patches_json(&self, filename: &str) -> Result<()> { + let write_path = self.workdir.join(filename); + let mut new_patches_file = File::create(&write_path) + .with_context(|| format!("writing to {}", write_path.display()))?; + new_patches_file.write_all(self.serialize_patches()?.as_bytes())?; + Ok(()) + } + + pub fn serialize_patches(&self) -> Result<String> { + let mut serialization_buffer = Vec::<u8>::new(); + // Four spaces to indent json serialization. 
+ let mut serializer = serde_json::Serializer::with_formatter( + &mut serialization_buffer, + serde_json::ser::PrettyFormatter::with_indent(b" "), + ); + self.patches + .serialize(&mut serializer) + .context("serializing patches to JSON")?; + // Append a newline at the end if not present. This is necessary to get + // past some pre-upload hooks. + if serialization_buffer.last() != Some(&b'\n') { + serialization_buffer.push(b'\n'); + } + Ok(std::str::from_utf8(&serialization_buffer)?.to_string()) + } + + fn hash_from_rel_patch(&self, patch: &PatchDictSchema) -> Result<String> { + hash_from_patch_path(&self.workdir.join(&patch.rel_patch_path)) + } +} + +/// Get the hash from the patch file contents. +/// +/// Not every patch file actually contains its own hash, +/// we must compute the hash ourselves when it's not found. +fn hash_from_patch(patch_contents: impl Read) -> Result<String> { + let mut reader = BufReader::new(patch_contents); + let mut buf = String::new(); + reader.read_line(&mut buf)?; + let mut first_line_iter = buf.trim().split(' ').fuse(); + let (fst_word, snd_word) = (first_line_iter.next(), first_line_iter.next()); + if let (Some("commit" | "From"), Some(hash_str)) = (fst_word, snd_word) { + // If the first line starts with either "commit" or "From", the following + // text is almost certainly a commit hash. + Ok(hash_str.to_string()) + } else { + // This is an annoying case where the patch isn't actually a commit. + // So we'll hash the entire file, and hope that's sufficient. + let mut hasher = Sha256::new(); + hasher.update(&buf); // Have to hash the first line. + reader.read_to_string(&mut buf)?; + hasher.update(buf); // Hash the rest of the file. + let sha = hasher.finalize(); + Ok(format!("{:x}", &sha)) + } +} + +fn hash_from_patch_path(patch: &Path) -> Result<String> { + let f = File::open(patch)?; + hash_from_patch(f) +} + +/// Copy a file from one path to another, and create any parent +/// directories along the way. 
+fn copy_create_parents(from: &Path, to: &Path) -> Result<()> { + let to_parent = to + .parent() + .with_context(|| format!("getting parent of {}", to.display()))?; + if !to_parent.exists() { + std::fs::create_dir_all(to_parent)?; + } + + copy(&from, &to) + .with_context(|| format!("copying file from {} to {}", &from.display(), &to.display()))?; + Ok(()) +} + +#[cfg(test)] +mod test { + use super::*; + + /// Test we can extract the hash from patch files. + #[test] + fn test_hash_from_patch() { + // Example git patch from Gerrit + let desired_hash = "004be4037e1e9c6092323c5c9268acb3ecf9176c"; + let test_file_contents = "commit 004be4037e1e9c6092323c5c9268acb3ecf9176c\n\ + Author: An Author <some_email>\n\ + Date: Thu Aug 6 12:34:16 2020 -0700"; + assert_eq!( + &hash_from_patch(test_file_contents.as_bytes()).unwrap(), + desired_hash + ); + + // Example git patch from upstream + let desired_hash = "6f85225ef3791357f9b1aa097b575b0a2b0dff48"; + let test_file_contents = "From 6f85225ef3791357f9b1aa097b575b0a2b0dff48\n\ + Mon Sep 17 00:00:00 2001\n\ + From: Another Author <another_email>\n\ + Date: Wed, 18 Aug 2021 15:03:03 -0700"; + assert_eq!( + &hash_from_patch(test_file_contents.as_bytes()).unwrap(), + desired_hash + ); + } + + #[test] + fn test_union() { + let patch1 = PatchDictSchema { + start_version: Some(0), + end_version: Some(1), + rel_patch_path: "a".into(), + metadata: None, + platforms: BTreeSet::from(["x".into()]), + }; + let patch2 = PatchDictSchema { + rel_patch_path: "b".into(), + platforms: BTreeSet::from(["x".into(), "y".into()]), + ..patch1.clone() + }; + let patch3 = PatchDictSchema { + platforms: BTreeSet::from(["z".into(), "x".into()]), + ..patch1.clone() + }; + let collection1 = PatchCollection { + workdir: PathBuf::new(), + patches: vec![patch1, patch2], + }; + let collection2 = PatchCollection { + workdir: PathBuf::new(), + patches: vec![patch3], + }; + let union = collection1 + .union_helper( + &collection2, + |p| 
Ok(p.rel_patch_path.to_string()), + |p| Ok(p.rel_patch_path.to_string()), + ) + .expect("could not create union"); + assert_eq!(union.patches.len(), 2); + assert_eq!( + union.patches[0].platforms.iter().collect::<Vec<&String>>(), + vec!["x", "z"] + ); + assert_eq!( + union.patches[1].platforms.iter().collect::<Vec<&String>>(), + vec!["x", "y"] + ); + } +} |