Either people are not interested, or Python's not capable, so here's a solution in Perl :-). Seriously, as noted above, you don't need to "alter strings". PDF annotations are the solution for you. I had a small project with annotations not long ago, and some of the code is from there. However, my content parser was not universal, and you don't need full-blown parsing -- meaning being able to alter content and write it back. Therefore I resorted to an external tool. The PDF library I use is somewhat low-level, but I don't mind. It also means that one is expected to have proper knowledge of PDF internals to understand what's going on. Otherwise, just use the tool.
Here's a shot of marking, e.g., all gerunds in the OP's file with the command:
perl pdf_hl.pl -f westlaw.pdf -p '\S*ing'
The code (the comments inside are worth reading, too):
use strict;
use warnings;
use XML::Simple;
use CAM::PDF;
use Getopt::Long;
use Regexp::Assemble;
#####################################################################
#
# This is PDF highlight mark-up tool.
# Though fully functional, it's still a prototype proof-of-concept.
# Please don't feed it with non-pdf files or patterns like '\d*'
# (because you probably want '\d+', don't you?).
#
# Requires muPDF-tools installed and in the PATH, plus some CPAN modules.
#
# ToDo:
# - error handling is primitive if any.
# - cropped files (CropBox) are processed incorrectly. Fix it.
# - of course there can be other useful parameters.
# - allow loading them from file.
# - allow searching across lines (e.g. for multi-word patterns)
# and certainly across "spans" within a line (see mudraw output).
# - multi-color mark-up, not just yellow.
# - control over output file name.
# - compress output (use cleanoutput method instead of output,
# plus more robust (think compressed object streams) compressors
# may be useful).
# - file list processing.
# - annotations are not just colorful marks on the page, their
# dictionaries can contain all sorts of useful information, which may
# be extracted automatically further up the food chain, i.e. by
# whoever consumes these files (date, time, author, comments, actual
# text below, etc., etc.), plus think of customized appearance streams,
# placing them on layers, etc.
# - ???
#
# Most of the complexity in the code comes from adding the appearance
# dictionary (AP). You can safely delete it, because most viewers don't
# need AP for standard annotations. Ironically, muPDF-viewer wants it
# (otherwise highlight placement is not 100% correct), and since I relied
# on muPDF-tools, I thought it proper to create PDFs consumable by
# their viewer... Firefox wants AP too, btw.
#
#####################################################################
# Command-line options:
#   $file   - PDF file to annotate (-f, required)
#   $csv    - comma-separated list of patterns (-p, required)
#   $c_flag - case-sensitive matching (-c / -noc, default off)
#   $w_flag - match whole words only (-w / -now, default on)
my ($file, $csv);
my ($c_flag, $w_flag) = (0, 1);

# GetOptions writes parsed values through references, so each destination
# has to be passed as \$variable; a plain scalar is not a valid destination.
# The script stops with the usage text when parsing fails or when either
# of the two mandatory options is missing.
GetOptions(
    '-f=s' => \$file,
    '-p=s' => \$csv,
    '-c!'  => \$c_flag,
    '-w!'  => \$w_flag,
)
    and defined($file)
    and defined($csv)
    or die "\nUsage: perl $0 -f FILE -p LIST -c -w\n",
           "-fFILE PDF file to annotate\n",
           "-pLIST comma-separated patterns\n",
           "-c or -noc be case sensitive (default = no)\n",
           "-w or -now whole words only (default = yes)\n";
# Merge all comma-separated patterns into a single regexp, optionally
# anchored on word boundaries and case-insensitive unless -c was given.
my $re = Regexp::Assemble->new
    ->add(split(',', $csv))
    ->anchor_word($w_flag)
    ->flags($c_flag ? '' : 'i')
    ->re;

# Ask mudraw for the per-character text layout as XML.  The list form of
# the pipe open passes $file as a single argument without going through a
# shell, so names containing spaces or shell metacharacters are safe.
# NOTE(review): list-form pipe opens need Perl 5.22+ on Windows -- confirm
# if that platform matters.
open(my $mudraw, '-|', 'mudraw', '-ttt', $file)
    or die "Cannot run mudraw: $!\n";
my $xml = do { local $/; <$mudraw> };
close($mudraw)
    or die "mudraw failed on '$file' (exit status @{[ $? >> 8 ]})\n";
defined($xml) && length($xml)
    or die "mudraw produced no output for '$file'\n";

# ForceArray keeps every level a list, even when it has a single child,
# so the nested loops further down can iterate uniformly.
my $tree = XMLin($xml, ForceArray => [qw/page block line span char/]);

# CAM::PDF->new returns false on failure and leaves the reason in
# $CAM::PDF::errstr.
my $pdf = CAM::PDF->new($file)
    or die "$CAM::PDF::errstr\n";
# Turn a list of numbers into an array ref of CAM::PDF 'number' nodes.
#
#   $precision - digits after the decimal point used when formatting
#   @values    - the numbers to wrap (remaining arguments)
#
# Returns a reference to an array holding one node per value, in order.
sub __num_nodes_list {
    my ($precision, @values) = @_;
    my $format = "%.${precision}f";

    my @nodes;
    for my $value (@values) {
        push @nodes, CAM::PDF::Node->new('number', sprintf($format, $value));
    }
    \@nodes
}
# Attach a yellow Highlight annotation (with its own appearance stream)
# to one page of the file-level $pdf object.
#
#   $idx               - page number, as understood by $pdf->getPage
#   $x1, $y1, $x2, $y2 - corners of the area to mark, with y measured
#                        downwards from the top edge of the page
#
# Uses the file-level $pdf and the __num_nodes_list() helper.
sub add_highlight {
    my ($idx, $x1, $y1, $x2, $y2) = @_;

    my $page = $pdf->getPage($idx);

    # Mirror vertically to get to the normal cartesian plane: offset x by
    # the page's left edge and measure y from its top edge instead.
    my ($page_left, undef, undef, $page_top) = $pdf->getPageDimensions($idx);
    my ($left, $bottom, $right, $top) =
        ($page_left + $x1, $page_top - $y2, $page_left + $x2, $page_top - $y1);
    my $width  = $right - $left;
    my $height = $top - $bottom;

    # Corner radius; the outline is stroked twice as wide with round joins.
    my $radius = 2;
    my $pad    = $radius * 2;

    # AP appearance stream: a filled and stroked yellow rectangle drawn in
    # the form's own coordinate system, blended through the GS0 state.
    my $stream = sprintf "/GS0 gs 1 1 0 rg 1 1 0 RG\n1 j %.0f w\n0 0 %.1f %.1f re B\n",
        $pad, $width, $height;

    # Form XObject carrying the stream above.
    my $blend_state = CAM::PDF::Node->new('dictionary', {
        BM => CAM::PDF::Node->new('label', 'Multiply'),
    });
    my $resources = CAM::PDF::Node->new('dictionary', {
        ExtGState => CAM::PDF::Node->new('dictionary', { GS0 => $blend_state }),
    });
    my $form = CAM::PDF::Node->new('dictionary', {
        Subtype    => CAM::PDF::Node->new('label', 'Form'),
        BBox       => CAM::PDF::Node->new('array',
            __num_nodes_list(1, -$radius, -$radius, $width + $pad, $height + $pad)),
        Resources  => $resources,
        StreamData => CAM::PDF::Node->new('stream', $stream),
        Length     => CAM::PDF::Node->new('number', length $stream),
    });

    # The form becomes a new object in the document; the annotation's
    # AP/N entry points at it by reference.
    my $form_objnum =
        $pdf->appendObject(undef, CAM::PDF::Node->new('object', $form), 0);
    my $appearance = CAM::PDF::Node->new('dictionary', {
        N => CAM::PDF::Node->new('reference', $form_objnum),
    });

    my $border_style = CAM::PDF::Node->new('dictionary', {
        S => CAM::PDF::Node->new('label', 'S'),
        W => CAM::PDF::Node->new('number', 0),
    });

    # The annotation dictionary itself.
    my $highlight = CAM::PDF::Node->new('dictionary', {
        Subtype    => CAM::PDF::Node->new('label', 'Highlight'),
        Rect       => CAM::PDF::Node->new('array',
            __num_nodes_list(1, $left - $radius, $bottom - $radius,
                                $right + $pad, $top + $pad)),
        QuadPoints => CAM::PDF::Node->new('array',
            __num_nodes_list(1, $left, $top, $right, $top,
                                $left, $bottom, $right, $bottom)),
        BS         => $border_style,
        Border     => CAM::PDF::Node->new('array', __num_nodes_list(0, 0, 0, 0)),
        C          => CAM::PDF::Node->new('array', __num_nodes_list(0, 1, 1, 0)),
        AP         => $appearance,
    });

    # Create the page's annotation list on first use, append the new
    # annotation, and flag the page object as modified.
    $page->{Annots} ||= CAM::PDF::Node->new('array', []);
    push @{ $pdf->getValue($page->{Annots}) }, $highlight;
    $pdf->{changes}->{ $page->{Type}->{objnum} } = 1
}
# Walk the mudraw tree page by page and highlight every match of $re.
# Matching is done per span: the span's characters are joined into one
# string, and match offsets index back into the span's char list to pick
# up the bounding boxes of the first and last matched character.
my $page_index = 1;    # page numbers passed to add_highlight start at 1
for my $page (@{$tree->{page}}) {
    for my $block (@{$page->{block}}) {
        for my $line (@{$block->{line}}) {
            for my $span (@{$line->{span}}) {
                # A span without char children would otherwise die under
                # strict refs when dereferenced inside map.
                my $chars  = $span->{char} || [];
                my $string = join '', map {$_->{c}} @$chars;
                while ($string =~ /$re/g) {
                    my ($from, $to) = ($-[0], $+[0]);
                    # A zero-length match (e.g. from a pattern like '\d*')
                    # covers no character: there is nothing to highlight,
                    # and $to - 1 would index the wrong char.
                    next if $to == $from;
                    # bbox holds four space-separated numbers; take the
                    # first pair from the first matched char and the
                    # second pair from the last matched char.
                    my ($x1, $y1) =
                        split ' ', $chars->[$from]->{bbox};
                    my (undef, undef, $x2, $y2) =
                        split ' ', $chars->[$to - 1]->{bbox};
                    add_highlight($page_index, $x1, $y1, $x2, $y2);
                }
            }
        }
    }
    $page_index++;
}

# Write the result next to the input, with '++' inserted before the last
# four characters of the name (e.g. 'in.pdf' becomes 'in++.pdf').
# NOTE(review): a name shorter than four characters is left unchanged by
# the substitution, so the input file would be overwritten.
$pdf->output($file =~ s/(.{4}$)/++$1/r);
__END__
P.S. I tagged the question with 'Perl' in the hope of getting some feedback (code corrections, etc.) from the community.