diff --git a/src/patch/error.rs b/src/patch/error.rs index 70828629..f55f4969 100644 --- a/src/patch/error.rs +++ b/src/patch/error.rs @@ -107,6 +107,9 @@ pub(crate) enum ParsePatchErrorKind { /// Missing newline at end of line. MissingNewline, + + /// Orphaned hunk header found after trailing content. + OrphanedHunkHeader, } impl fmt::Display for ParsePatchErrorKind { @@ -132,6 +135,7 @@ impl fmt::Display for ParsePatchErrorKind { Self::UnexpectedNoNewlineMarker => "unexpected 'No newline at end of file' line", Self::UnexpectedHunkLine => "unexpected line in hunk body", Self::MissingNewline => "missing newline", + Self::OrphanedHunkHeader => "orphaned hunk header after trailing content", }; write!(f, "{msg}") } diff --git a/src/patch/mod.rs b/src/patch/mod.rs index 3a182506..e228f708 100644 --- a/src/patch/mod.rs +++ b/src/patch/mod.rs @@ -16,6 +16,30 @@ use crate::utils::{byte_needs_quoting, fmt_escaped_byte, write_escaped_byte}; const NO_NEWLINE_AT_EOF: &str = "\\ No newline at end of file"; /// Representation of all the differences between two files +/// +/// # Parsing modes +/// +/// `Patch` provides two parsing modes with different strictness levels, +/// modeled after the behavior of GNU patch and `git apply`: +/// +/// | Scenario | GNU patch | git apply | [`from_str`] | [`from_str_strict`] | +/// |-----------------------------------|-------------|-----------|--------------|---------------------| +/// | Junk after all hunks are complete | Ignores | Ignores | Ignores | Ignores | +/// | Junk between hunks | Ignores[^1] | Errors | Ignores[^1] | Errors | +/// +/// [^1]: "Ignores" here means silently stopping at the junk. +/// Only hunks before it are parsed; later hunks are dropped. +/// +/// [`from_str`] and [`from_bytes`] follow GNU patch behavior, +/// silently ignoring non-patch content after a hunk's line counts are satisfied. +/// +/// [`from_str_strict`] and [`from_bytes_strict`] follow `git apply` behavior, +/// additionally rejecting orphaned hunk headers hidden behind trailing content. +/// +/// [`from_str`]: Patch::from_str +/// [`from_bytes`]: Patch::from_bytes +/// [`from_str_strict`]: Patch::from_str_strict +/// [`from_bytes_strict`]: Patch::from_bytes_strict #[derive(PartialEq, Eq)] pub struct Patch<'a, T: ToOwned + ?Sized> { // TODO GNU patch is able to parse patches without filename headers. @@ -108,6 +132,15 @@ impl<'a> Patch<'a, str> { pub fn from_str(s: &'a str) -> Result, ParsePatchError> { parse::parse(s) } + + /// Parse a `Patch` from a string in strict mode + /// + /// Unlike [`Patch::from_str`], + /// this rejects orphaned hunk headers hidden after trailing content, + /// matching `git apply` behavior. + pub fn from_str_strict(s: &'a str) -> Result, ParsePatchError> { + parse::parse_strict(s) + } } impl<'a> Patch<'a, [u8]> { @@ -115,6 +148,15 @@ impl<'a> Patch<'a, [u8]> { pub fn from_bytes(s: &'a [u8]) -> Result, ParsePatchError> { parse::parse_bytes(s) } + + /// Parse a `Patch` from bytes in strict mode + /// + /// Unlike [`Patch::from_bytes`], + /// this rejects orphaned hunk headers hidden after trailing content, + /// matching `git apply` behavior. + pub fn from_bytes_strict(s: &'a [u8]) -> Result, ParsePatchError> { + parse::parse_bytes_strict(s) + } } impl Clone for Patch<'_, T> { diff --git a/src/patch/parse.rs b/src/patch/parse.rs index 1176222b..fda46075 100644 --- a/src/patch/parse.rs +++ b/src/patch/parse.rs @@ -65,6 +65,19 @@ pub fn parse(input: &str) -> Result> { )) } +pub fn parse_strict(input: &str) -> Result> { + let mut parser = Parser::new(input); + let header = patch_header(&mut parser)?; + let hunks = hunks(&mut parser)?; + reject_orphaned_hunk_headers(&mut parser)?; + + Ok(Patch::new( + header.0.map(convert_cow_to_str), + header.1.map(convert_cow_to_str), + hunks, + )) +} + pub fn parse_bytes(input: &[u8]) -> Result> { let mut parser = Parser::new(input); let header = patch_header(&mut parser)?; @@ -73,6 +86,15 @@ pub fn parse_bytes(input: &[u8]) -> Result> { Ok(Patch::new(header.0, header.1, hunks)) } +pub fn parse_bytes_strict(input: &[u8]) -> Result> { + let mut parser = Parser::new(input); + let header = patch_header(&mut parser)?; + let hunks = hunks(&mut parser)?; + reject_orphaned_hunk_headers(&mut parser)?; + + Ok(Patch::new(header.0, header.1, hunks)) +} + // This is only used when the type originated as a utf8 string fn convert_cow_to_str(cow: Cow<'_, [u8]>) -> Cow<'_, str> { match cow { @@ -154,9 +176,26 @@ fn verify_hunks_in_order(hunks: &[Hunk<'_, T>]) -> bool { true } +/// Scans remaining lines for orphaned `@@ ` hunk headers. +/// +/// In strict mode (git-apply behavior), trailing junk is allowed but +/// an `@@ ` line hiding behind that junk indicates a lost hunk. +fn reject_orphaned_hunk_headers(parser: &mut Parser<'_, T>) -> Result<()> { + while let Some(line) = parser.peek() { + if line.starts_with("@@ ") { + return Err(parser.error(ParsePatchErrorKind::OrphanedHunkHeader)); + } + parser.next()?; + } + Ok(()) +} + fn hunks<'a, T: Text + ?Sized>(parser: &mut Parser<'a, T>) -> Result>> { let mut hunks = Vec::new(); - while parser.peek().is_some() { + // Following GNU patch behavior: stop at non-@@ content. + // Any trailing content (including hidden @@ headers) is silently ignored. + // This is more permissive than git apply, which errors on junk between hunks. + while parser.peek().is_some_and(|line| line.starts_with("@@ ")) { hunks.push(hunk(parser)?); } @@ -173,13 +212,7 @@ fn hunk<'a, T: Text + ?Sized>(parser: &mut Parser<'a, T>) -> Result> let header_line = parser.next()?; let (range1, range2, function_context) = hunk_header(header_line).map_err(|e| parser.error_at(e.kind, hunk_start))?; - let lines = hunk_lines(parser)?; - - // check counts of lines to see if they match the ranges in the hunk header - let (len1, len2) = super::hunk_lines_count(&lines); - if len1 != range1.len || len2 != range2.len { - return Err(parser.error_at(ParsePatchErrorKind::HunkMismatch, hunk_start)); - } + let lines = hunk_lines(parser, range1.len, range2.len, hunk_start)?; Ok(Hunk::new(range1, range2, function_context, lines)) } @@ -223,36 +256,61 @@ fn range(s: &T) -> Result { Ok(HunkRange::new(start, len)) } -fn hunk_lines<'a, T: Text + ?Sized>(parser: &mut Parser<'a, T>) -> Result>> { +fn hunk_lines<'a, T: Text + ?Sized>( + parser: &mut Parser<'a, T>, + expected_old: usize, + expected_new: usize, + hunk_start: usize, +) -> Result>> { let mut lines: Vec> = Vec::new(); let mut no_newline_context = false; let mut no_newline_delete = false; let mut no_newline_insert = false; + let mut old_count = 0; + let mut new_count = 0; + while let Some(line) = parser.peek() { + let hunk_complete = old_count >= expected_old && new_count >= expected_new; + let line = if line.starts_with("@") { break; } else if no_newline_context { + if hunk_complete { + break; + } return Err(parser.error(ParsePatchErrorKind::ExpectedEndOfHunk)); } else if let Some(line) = line.strip_prefix(" ") { + if hunk_complete { + break; + } Line::Context(line) } else if line.starts_with("\n") { + if hunk_complete { + break; + } Line::Context(*line) } else if let Some(line) = line.strip_prefix("-") { if no_newline_delete { return Err(parser.error(ParsePatchErrorKind::TooManyDeletedLines)); } + if hunk_complete { + break; + } Line::Delete(line) } else if let Some(line) = line.strip_prefix("+") { if no_newline_insert { return Err(parser.error(ParsePatchErrorKind::TooManyInsertedLines)); } + if hunk_complete { + break; + } Line::Insert(line) } else if line.starts_with(NO_NEWLINE_AT_EOF) { let last_line = lines .pop() .ok_or_else(|| parser.error(ParsePatchErrorKind::UnexpectedNoNewlineMarker))?; - match last_line { + let modified = match last_line { Line::Context(line) => { no_newline_context = true; Line::Context(strip_newline(line)?) @@ -265,15 +323,38 @@ fn hunk_lines<'a, T: Text + ?Sized>(parser: &mut Parser<'a, T>) -> Result { + old_count += 1; + new_count += 1; + } + Line::Delete(_) => { + old_count += 1; + } + Line::Insert(_) => { + new_count += 1; + } + } + lines.push(line); parser.next()?; } + if old_count != expected_old || new_count != expected_new { + return Err(parser.error_at(ParsePatchErrorKind::HunkMismatch, hunk_start)); + } + Ok(lines) } diff --git a/src/patch/tests.rs b/src/patch/tests.rs index 235040c2..19671fd5 100644 --- a/src/patch/tests.rs +++ b/src/patch/tests.rs @@ -1,5 +1,245 @@ use super::error::ParsePatchErrorKind; -use super::parse::{parse, parse_bytes}; +use super::parse::{parse, parse_bytes, parse_bytes_strict, parse_strict}; + +#[test] +fn trailing_garbage_after_complete_hunk() { + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old line ++new line +this is trailing garbage +that should be ignored +"; + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); + assert_eq!(patch.hunks()[0].old_range().len(), 1); + assert_eq!(patch.hunks()[0].new_range().len(), 1); +} + +#[test] +fn garbage_before_hunk_complete_fails() { + // If hunk line count isn't satisfied, garbage causes error + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1,3 +1,3 @@ +-line 1 ++LINE 1 +garbage before hunk complete + line 3 +"; + assert_eq!( + parse(s).unwrap_err().kind, + ParsePatchErrorKind::UnexpectedHunkLine, + ); +} + +#[test] +fn git_headers_after_hunk_ignored() { + // Git extended headers appearing after a complete hunk should be ignored + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +diff --git a/other.txt b/other.txt +index 1234567..89abcdef 100644 +"; + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); +} + +/// When splitting multi-patch input by `---/+++` boundaries, trailing +/// `diff --git` lines from the next patch may linger. If the last hunk +/// ends with `\ No newline at end of file`, the parser should still +/// recognize the hunk as complete and ignore the trailing content, +/// as GNU patch does. +/// +/// Pattern first appeared in rust-lang/cargo@b119b891df93f128abef634215cd8f967c3cd120 +/// where HTML files lost their trailing newlines. +#[test] +fn no_newline_at_eof_followed_by_trailing_garbage() { + let s = "\ +--- a/file.html ++++ b/file.html +@@ -1,3 +1,3 @@ +
+-

old

++

new

+
+\\ No newline at end of file +diff --git a/other.html b/other.html +index 1234567..89abcdef 100644 +"; + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); + assert_eq!(patch.hunks()[0].old_range().len(), 3); + assert_eq!(patch.hunks()[0].new_range().len(), 3); +} + +#[test] +fn multi_hunk_with_trailing_garbage() { + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-a ++A +@@ -5 +5 @@ +-b ++B +some trailing garbage +"; + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 2); +} + +#[test] +fn garbage_between_hunks_stops_parsing() { + // GNU patch would try to parse the second @@ as a new patch + // and fail because there's no `---` header. + // + // diffy `Patch` is a single patch parser, so should just ignore everything + // after the first complete hunk when garbage is encountered. + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-a ++A +not a hunk line +@@ -5 +5 @@ +-b ++B +"; + let patch = parse(s).unwrap(); + // Only first hunk is parsed; second @@ is ignored as garbage + assert_eq!(patch.hunks().len(), 1); +} + +#[test] +fn context_lines_counted_correctly() { + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1,4 +1,4 @@ + context 1 +-deleted ++inserted + context 2 + context 3 +trailing garbage +"; + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); + assert_eq!(patch.hunks()[0].old_range().len(), 4); + assert_eq!(patch.hunks()[0].new_range().len(), 4); +} + +// Strict mode (git-apply behavior): rejects orphaned hunk headers +// hidden behind trailing content, but allows plain trailing junk. +mod strict_mode { + use super::*; + + #[test] + fn trailing_junk_allowed() { + // git apply accepts trailing junk after all hunks + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +this is trailing garbage +"; + let patch = parse_strict(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); + } + + #[test] + fn trailing_junk_allowed_bytes() { + let s = b"\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +this is trailing garbage +"; + let patch = parse_bytes_strict(&s[..]).unwrap(); + assert_eq!(patch.hunks().len(), 1); + } + + #[test] + fn orphaned_hunk_header_after_junk() { + // Junk between hunks hides the second @@ — strict rejects this + // since git apply errors with "patch fragment without header". + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-a ++A +not a hunk line +@@ -5 +5 @@ +-b ++B +"; + assert_eq!( + parse_strict(s).unwrap_err().kind, + ParsePatchErrorKind::OrphanedHunkHeader, + ); + } + + #[test] + fn no_junk_parses_normally() { + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +"; + let patch = parse_strict(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); + } + + #[test] + fn multi_hunk_no_junk() { + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-a ++A +@@ -5 +5 @@ +-b ++B +"; + let patch = parse_strict(s).unwrap(); + assert_eq!(patch.hunks().len(), 2); + } + + #[test] + fn garbage_before_hunk_complete_fails() { + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1,3 +1,3 @@ +-line 1 ++LINE 1 +garbage before hunk complete + line 3 +"; + assert_eq!( + parse_strict(s).unwrap_err().kind, + ParsePatchErrorKind::UnexpectedHunkLine, + ); + } +} #[test] fn test_escaped_filenames() {