Skip to content

Commit f6afb4f

Browse files
committed
rewrite whitespace-only
1 parent 2e5ad07 commit f6afb4f

File tree

5 files changed

+1008
-28
lines changed

5 files changed

+1008
-28
lines changed

CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,14 @@ add_test(NAME lws5
8484
-P ${CMAKE_CURRENT_SOURCE_DIR}/tests/runtest.cmake
8585
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests/5)
8686

87+
# Test "lws6": runs tests/runtest.cmake in script mode (-P) from tests/6,
# passing the built project executable as CMD, b-comms.c as SRC and
# gemini.patch as PATCH.
# NOTE(review): EXPSHA / EXPSHA_WIN are presumably the expected result hashes
# for non-Windows / Windows runs -- confirm against tests/runtest.cmake.
add_test(NAME lws6
88+
COMMAND ${CMAKE_COMMAND}
89+
-DCMD=$<TARGET_FILE:${PROJECT_NAME}>
90+
-DSRC=b-comms.c
91+
-DPATCH=gemini.patch
92+
-DEXPSHA=6ea83a67aba0358099752cfaf83a28d5d983b50855e93352ae9c04d656c7911e
93+
-DEXPSHA_WIN=2e6b9b12ae0128c9edfc109744b9c67848712b0521c322a45104895aa4cbc3b1
94+
-P ${CMAKE_CURRENT_SOURCE_DIR}/tests/runtest.cmake
95+
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests/6)
96+
8797

README.md

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# fixdiff
22

3-
Andy Green <[email protected]> 2025
4-
See MIT license in LICENSE
3+
Copyright (C) 2025 Andy Green <[email protected]>
4+
Licensed under MIT license, see LICENSE
55

66
```
77
$ cat llm-patch.diff | fixdiff | patch -p1
@@ -24,22 +24,25 @@ $ cat llm-patch.diff | fixdiff /path/to/sources | patch -p1
2424

2525
LLMs find it hard to generate diff headers with correct line counts or even
2626
line offsets, although some LLMs are smart enough to produce otherwise
27-
legible diffs.
27+
legible diffs. Often the content or just the context lines around the
28+
changes are not quite right.
2829

2930
This utility adjusts the diff stanzas sent to it on stdin and produces new stanza
3031
headers with accurate line counts on stdout.
3132

3233
It silently repairs:
3334

34-
- added empty lines with only whitespace become blank lines
35-
- wrong "before" line in original stanza header
36-
- wrong "before" line count in original stanza header
37-
- wrong "after" line in original stanza header
38-
- wrong "after" line count in original stanza header
39-
- removes extra lead-in context lines in stanza
40-
- for diffs adding to end of file, corrects mismatching context caused by
41-
LLM losing blank lines at the original EOF (by checking the original
42-
source file for extra lines and adding them to the stanza as context)
35+
1. new lines containing only whitespace, by rewriting them as blank lines
36+
2. original lines in diff that differ from real line in file only by
37+
whitespace are rewritten to contain the correct whitespace
38+
3. wrong "before" line in original stanza header
39+
4. wrong "before" line count in original stanza header
40+
5. wrong "after" line in original stanza header
41+
6. wrong "after" line count in original stanza header
42+
7. extra lead-in context lines in a stanza, by removing them until only 3 remain
43+
8. diffs adding to end of file with missing or wrong context caused by
44+
LLM losing blank lines at the original EOF are rewritten by checking
45+
the original source file for extra lines and adding them to the stanza as context
4346

4447
It finds and scans the sources the patches apply to and uses the diff stanza to
4548
find the original line it applied to by itself, along with the original line

fixdiff.c

Lines changed: 170 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,21 @@ typedef struct {
8484
int li;
8585
} lbuf_t;
8686

87+
typedef struct rewriter { /* one queued replacement for a single stanza line */
88+
struct rewriter *next; /* next queued rewrite in the singly-linked list; the walk stops at NULL */
89+
size_t len; /* number of bytes of text written to stdout in place of the line */
90+
int line; /* stanza line number (lbuf_t li of the temp file) to be replaced */
91+
char *text; /* replacement bytes; pointed at the storage allocated just past this struct */
92+
} rewriter_t;
93+
/* text points at extra storage allocated directly after the struct */
94+
8795
typedef struct {
8896
off_t flo;
8997

9098
const char *reason;
9199

100+
rewriter_t *rewriter_head;
101+
92102
dss_t d;
93103
int pre;
94104
int post;
@@ -102,6 +112,8 @@ typedef struct {
102112

103113
int fd_temp;
104114

115+
int li_out;
116+
105117
char ongoing;
106118
char skip_this_one;
107119
char lead_in_active;
@@ -313,10 +325,26 @@ fixdiff_stanza_start(dp_t *pdp, char *sh, size_t len)
313325
return 0;
314326
}
315327

328+
/*
 * Bounded string copy for diagnostics that makes tabs visible.
 *
 * Copies at most len - 1 bytes of the NUL-terminated string `in` into
 * `dest`, always NUL-terminates the result, and writes '>' in place of
 * every tab so the logged text shows where tabs were.
 *
 * dest: destination buffer, at least len bytes
 * in:   NUL-terminated source string
 * len:  size of dest in bytes; when 0 nothing is written (the unguarded
 *       `len - 1` would otherwise wrap around and write before dest)
 */
static void
stain_copy(char *dest, const char *in, size_t len)
{
	size_t n;

	if (!len)
		return;

	/* single pass: copy and substitute together, stop at NUL or when full */
	for (n = 0; n < len - 1 && in[n]; n++)
		dest[n] = in[n] == '\t' ? '>' : in[n];

	dest[n] = '\0';
}
343+
316344
static int
317345
fixdiff_find_original(dp_t *pdp, int *line_start)
318346
{
319-
char in_src[4096], in_temp[4096], b1[256], b2[256], hit = 0;
347+
char in_src[4096], in_temp[4096], b1[256], b2[256], f1[256], f2[256], hit = 0;
320348
int ret = 1, mc = 0, lmc = 0, lis = 0, lg_lis = 0;
321349
lbuf_t lb_temp, lb_src, lb;
322350
size_t lt, ls;
@@ -329,6 +357,8 @@ fixdiff_find_original(dp_t *pdp, int *line_start)
329357
lb_src.fd = lb.fd = -1;
330358
b1[0] = '\0';
331359
b2[0] = '\0';
360+
f1[0] = '\0';
361+
f2[0] = '\0';
332362

333363
init_lbuf(&lb_temp, "temp");
334364
lb_temp.fd = open(pdp->temp, OFLAGS(O_RDWR));
@@ -402,26 +432,105 @@ fixdiff_find_original(dp_t *pdp, int *line_start)
402432
break;
403433

404434
if (!ls) {
405-
elog("failed to match, best chunk %d lines at %s:%d\n",
435+
elog("failed to match, best chunk %d lines at %s:%d (tabs shown below as >)\n",
406436
lmc, pdp->pf, lg_lis);
407-
elog("patch: '%s', source '%s'\n", b1, b2);
437+
elog("last match: patch = '%s"
438+
"', source = '%s'\n", b1, b2);
439+
elog("divergence: patch = '%s"
440+
"', source = '%s'\n", f1, f2);
408441
mc = 0;
409442
break;
410443
}
411444

412445
if (fixdiff_strcmp(in_temp + 1, lt - 1, &let, in_src, ls, &les)) {
413-
if (mc > pdp->pre + pdp->post)
414-
elog("match failed after %d: '%s' / '%s'", mc, in_temp + 1, in_src);
446+
/*
447+
* It's not a match.
448+
*
449+
* It's still possible we only differ by whitespace.
450+
* Does it match if we treat any whitespace as a single
451+
* whitespace match token?
452+
*/
453+
454+
char *p1 = in_temp + 1, *p1_end = p1 + lt - 1 - (int)let,
455+
*p2 = in_src, *p2_end = p2 + ls - (int)les;
456+
457+
while (p1 < p1_end && p2 < p2_end) {
458+
char wst1 = 0, wst2 = 0;
459+
460+
while (*p1 == ' ' || *p1 == '\t' && p1 < p1_end) {
461+
p1++;
462+
wst1 = 1;
463+
}
464+
while (*p2 == ' ' || *p2 == '\t' && p2 < p2_end) {
465+
p2++;
466+
wst2 = 1;
467+
}
468+
469+
if (wst1 != wst2)
470+
goto record_breakage;
471+
472+
if (*p1 != *p2)
473+
goto record_breakage;
474+
475+
p1++;
476+
p2++;
477+
}
478+
479+
if ((p1 < p1_end) != (p2 < p2_end))
480+
goto record_breakage;
481+
482+
elog("(fixable whitespace-only difference at stanza line %d)\n", lb_temp.li);
483+
484+
/*
485+
* We have to take care about picking up windows _TEXT
486+
* CRLF, eliminating that if present and only putting
487+
* the LF, so rewritten lines are indistinguishable
488+
*/
489+
490+
{
491+
size_t rlen = ls + 1 - les + 1;
492+
rewriter_t *rwt = malloc(sizeof(*rwt) + rlen + 1);
493+
494+
if (!rwt) {
495+
elog("OOM\n");
496+
return -1;
497+
}
498+
rwt->next = pdp->rewriter_head;
499+
pdp->rewriter_head = rwt;
500+
rwt->line = lb_temp.li;
501+
rwt->text = (char *)&rwt[1];
502+
rwt->text[0] = *in_temp;
503+
rwt->len = rlen;
504+
memcpy(rwt->text + 1, in_src, ls);
505+
rwt->text[1 + ls] = '\n';
506+
}
507+
goto allow_match_ws;
508+
509+
record_breakage:
510+
if (mc + 1 > lmc) {
511+
stain_copy(f1, in_temp + 1, sizeof(f1));
512+
stain_copy(f2, in_src, sizeof(f2));
513+
}
415514
mc = 0;
515+
{
516+
rewriter_t *rwt = pdp->rewriter_head, *rwt1;
517+
518+
while (rwt) {
519+
rwt1 = rwt->next;
520+
free(rwt);
521+
rwt = rwt1;
522+
}
523+
524+
pdp->rewriter_head = NULL;
525+
}
416526
break;
417527
}
418528

529+
allow_match_ws:
419530
mc++;
420531
if (mc > lmc) {
421-
strncpy(b1, in_temp + 1, sizeof(b1) - 1);
422-
b1[sizeof(b1) - 1] = '\0';
423-
strncpy(b2, in_src + 1, sizeof(b2) - 1);
424-
b2[sizeof(b2) - 1] = '\0';
532+
stain_copy(b1, in_temp + 1, sizeof(b1));
533+
stain_copy(b2, in_src, sizeof(b2));
425534
lmc++;
426535
lg_lis = lis;
427536
}
@@ -512,8 +621,9 @@ fixdiff_find_original(dp_t *pdp, int *line_start)
512621
static int
513622
fixdiff_stanza_end(dp_t *pdp)
514623
{
624+
int orig, nope = 0;
625+
lbuf_t lb_temp;
515626
char buf[256];
516-
int orig;
517627

518628
if (!pdp->ongoing)
519629
return 0;
@@ -554,21 +664,64 @@ fixdiff_stanza_end(dp_t *pdp)
554664

555665
/* dump the temp side-buffer into stdout */
556666

557-
lseek(pdp->fd_temp, pdp->flo, SEEK_SET);
667+
init_lbuf(&lb_temp, "lb_temp");
668+
lb_temp.fd = open(pdp->temp, OFLAGS(O_RDONLY));
669+
lseek(lb_temp.fd, pdp->flo, SEEK_SET);
670+
558671
while (1) {
559-
ssize_t l = read(pdp->fd_temp, buf, sizeof(buf));
672+
char buf[4096];
673+
ssize_t l = fixdiff_get_line(&lb_temp, buf, sizeof(buf));
674+
rewriter_t *rwt = pdp->rewriter_head;
675+
560676
if (!l)
561677
break;
562678

563-
if (write(1, buf, TO_POSLEN(l)) != (ssize_t)l) {
564-
pdp->reason = "failed to write to stdout";
565-
return 1;
679+
// elog("dumping %d (len %d)\n", (int)pdp->li_out, (int)l);
680+
681+
while (rwt) {
682+
// elog("%d %d\n", rwt->line, pdp->li_out);
683+
if (rwt->line == lb_temp.li /*pdp->li_out*/) /* we need to rewrite this line */
684+
break;
685+
686+
rwt = rwt->next;
687+
}
688+
689+
if (rwt) {
690+
// elog("rewriting '%.*s' to '%.*s'\n", (int)l, buf, (int)rwt->len, rwt->text);
691+
if (write(1, rwt->text, TO_POSLEN(rwt->len)) != (ssize_t)rwt->len) {
692+
pdp->reason = "failed to write to stdout";
693+
nope = 1;
694+
break;
695+
}
696+
} else {
697+
if (write(1, buf, TO_POSLEN(l)) != (ssize_t)l) {
698+
pdp->reason = "failed to write to stdout";
699+
nope = 1;
700+
break;
701+
}
702+
}
703+
704+
pdp->li_out++;
705+
}
706+
707+
{
708+
rewriter_t *rwt = pdp->rewriter_head, *rwt1;
709+
710+
while (rwt) {
711+
rwt1 = rwt->next;
712+
free(rwt);
713+
rwt = rwt1;
566714
}
715+
716+
pdp->rewriter_head = NULL;
567717
}
568718

569-
close(pdp->fd_temp);
719+
close(lb_temp.fd);
570720
pdp->fd_temp = -1;
571721

722+
if (nope)
723+
return 1;
724+
572725
/* track the effect stanza changes are having on line offsets */
573726
pdp->delta += pdp->post - pdp->pre;
574727

@@ -611,6 +764,7 @@ main(int argc, char *argv[])
611764
dp.d = DSS_WAIT_MMM;
612765
dp.lb.fd = 0; /* stdin */
613766
dp.fd_temp = -1;
767+
dp.li_out = 1;
614768

615769
while (1) {
616770
size_t l = fixdiff_get_line(&dp.lb, in, sizeof(in));

0 commit comments

Comments
 (0)