Reference Documentation: PDF Publishing with GNU Troff
Revision  | 7fb941487470b7d236080e148971ecf149d072d2 (tree)
Date      | 2024-10-08 03:30:51
Author    | Keith Marshall <keith@user...>
Committer | Keith Marshall
Improve filtering of groff special tokens from sanitized text.
* tmac/sanitize.tmac (sanitize:scan.reject): New string; with
default value of "'\&\%\:'", it specifies a list of groff special
tokens, enclosed within a matching pair of arbitrary delimiting
tokens, ("'" in this default case), which are to be filtered out
of sanitized text, and thus discarded from this context.
(sanitize:scan.subst): New string; with default value comprising
the space-separated list of quoted macro arguments, "'\-'-'",
and "'\ \~' '", this specifies a collection of special token
substitution groups, arbitrarily delimited by the "'" token.
(sanitize:scan.filter): New macro; called by...
(sanitize): ...this, it interprets, and applies the effect of...
(sanitize:scan.reject, sanitize:scan.subst): ...these filters.
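The filtering scheme described above can be sketched in Python. This is a hypothetical model, not part of the package: the names `parse_group`, `reject_to_group`, and `sanitize` are illustrative, groff input tokens are modelled as plain strings, and the filter specifications are written exactly as in the package defaults.

```python
# Hypothetical sketch of the filtering scheme described in the commit
# message; not part of sanitize.tmac.  A reject list such as "'\&\%\:'"
# is converted to a substitution group with an empty substitute field,
# then every scanned token is checked against each group in turn.

def parse_group(spec: str):
    """Split a delimited spec into (match-token list, substitute text).

    The first character is the delimiter; a well-formed spec contains
    exactly three copies of it, e.g. "'\\-'-'" yields (['\\-'], '-').
    """
    delim = spec[0]
    if spec.count(delim) != 3 or spec[-1] != delim:
        raise ValueError(f"malformed substitution group: {spec!r}")
    match_field, subst = spec[1:-1].split(delim)
    # Tokenize the match field: a backslash introduces a two-character
    # escape token; anything else is a single-character token.
    tokens, i = [], 0
    while i < len(match_field):
        n = 2 if match_field[i] == "\\" and i + 1 < len(match_field) else 1
        tokens.append(match_field[i:i + n])
        i += n
    return tokens, subst

def reject_to_group(spec: str) -> str:
    """A reject list is a two-delimiter spec; appending one more copy
    of its delimiter yields a group with an empty substitute field."""
    return spec + spec[0]

def sanitize(tokens, reject, subst_groups):
    """Filter a token stream: rejected tokens are dropped, tokens in a
    substitution group are replaced, all others are copied through."""
    groups = [parse_group(reject_to_group(reject))]
    groups += [parse_group(g) for g in subst_groups]
    out = []
    for tok in tokens:
        for match_tokens, subst in groups:
            if tok in match_tokens:
                out.append(subst)
                break
        else:
            out.append(tok)
    return "".join(out)
```

With the default filters, a token stream such as `f o o \% \- b a r \~ b a z` filters to "foo-bar baz": the `\%` token is rejected outright, `\-` substitutes a hyphen-minus, and `\~` substitutes an ASCII SP.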
@@ -29,6 +29,12 @@
 with this program.  If not, see <http://www.gnu.org/licenses/>.

 ..
+.\" Package Entry Point
+.\" -------------------
+.\"
+.\" All access to the capabilities of this package should be directed
+.\" through the sanitize macro; implement it.
+.\"
 .de sanitize
 .\" Usage: .sanitize name text ...
 .\"
@@ -66,23 +72,12 @@
 . \" have been replaced by the string "^["; if this string has been
 . \" returned, we prepare to handle an escape sequence...
 . \"
-. ie '\\*[sanitize:scan.char]'^[' .nr sanitize:skip.count 1
-. el .if '\\*[sanitize:scan.char]'\%' \{\
-. \" The "\%" escape requires special handling; assuming
-. \" that it will never be used to introduce a hyphen into
-. \" a PDF outline entry, it may simply be discarded, by
-. \" the expedient of handling it as if it is equivalent
-. \" to an artificial "\F%" escape sequence.
-. \"
-. ds sanitize:residual F%\\*[sanitize:residual]
-. nr sanitize:residual.length +2
-. nr sanitize:skip.count 1
-. \}
-. ie \\n[sanitize:skip.count] \{\
+. ie '\\*[sanitize:scan.char]'^[' \{\
+. nr sanitize:skip.count 1
+.
 . \" When a possible escape sequence has been detected, we back it
 . \" up, (in case it isn't recognized, and we need to reinstate its
-. \" content into the result string), then scan ahead to check for
-. \" an identifiable escape sequence...
+. \" content into the result string), then scan ahead...
 . \"
 . rn sanitize:scan.char sanitize:hold
 . sanitize:scan.execute
@@ -102,14 +97,10 @@
 .
 . \" When the current scan cycle has retrieved a character, which isn't
 . \" part of any possible escape sequence, we simply copy that character
-. \" to the result string; map escapes such as "\ ", "\~", and "\0" to
-. \" simple ASCII SP, and "\-" to ASCII hyphen-minus, as we go.
+. \" to the result string, while processing any special token removals,
+. \" or substitutions, as we go.
 . \"
-. el \{\
-. ie '\\*[sanitize:scan.char]'\-' .ds sanitize:scan.char "-\"
-. el .sanitize:scan-space-token.remap "\ " "\~" "\0"
-. as sanitize:result "\\*[sanitize:scan.char]\"
-. \}
+. el .sanitize:scan.filter \\*[sanitize:scan.reject]
 . \}
 .
 .\" Clean up the register space, by deleting all of the string registers,
@@ -147,21 +138,223 @@
 . if !'\\*[sanitize:scan.char]'\\' .ds sanitize:scan.char "^[\"
 . ec
 ..
-.de sanitize:scan-space-token.remap
-.\" Usage (internal): .sanitize:scan-space-token.remap space-token ...
+.
+.\" Filters for Removal and Substitution of Special Tokens
+.\" ------------------------------------------------------
 .\"
-.\" This internal helper macro maps any groff escape token which becomes
-.\" a space, within formatted output, to an ASCII SP in sanitized text.
+.ds sanitize:scan.reject "'\&\%\:'\"
+.ds sanitize:scan.subst ""'\-'-'" "'\ \~' '"\"
+.
+.\" Note that both of these are handled by a common filter processing
+.\" macro, with the REJECT filters being converted to their equivalent
+.\" SUBST filters, each with an empty substitution field.
+.\"
+.de sanitize:scan.filter
+.\" Usage (internal): .sanitize:scan.filter <reject-list> ...
+.\"
+.\" Handler for processing of any input token, when the sanitize
+.\" macro has determined that it does not contribute to any part
+.\" of a normal escape sequence; (it may, however, represent any
+.\" of the special escape sequences, each of which is encoded, in
+.\" copy mode, as a single input token).
 .\"
 . while \\n(.$ \{\
-. if '\\$1'\\*[sanitize:scan.char]' \{\
-. \" The current input character matches one of the specified
-. \" space tokens; substitute and return a simple ASCII SP.
+. \" Each passed <reject-list> argument must comprise a string
+. \" of individual tokens, enclosed in a matching pair of other
+. \" arbitrary tokens, which do not otherwise appear in the list;
+. \" convert each to a token substitution group specification,
+. \" in which the substitute text is an empty string.
+. \"
+. length \\$0.arglen \\$1
+. if \\n[\\$0.arglen]>2 \{\
+. \" Note that any valid <reject-list> argument MUST comprise
+. \" AT LEAST two delimiter tokens, and ONE OR MORE additional
+. \" tokens, which form the <reject-list> content; beyond this
+. \" minimum length limitation, the integrity of <reject-list>
+. \" arguments is NOT verified at this stage; the integrity of
+. \" each WILL be checked, only on subsequent evaluation of
+. \" its corresponding substitution group specification.
 . \"
-. ds sanitize:scan.char " \"
-. return
+. ie d \\$0.arglist .as \\$0.arglist "\\*[\\$0.delimiter]" "\\$1\"
+. el .ds \\$0.arglist ""\\$1\"
+.
+. \" The delimiter token, for the current <reject-list> string,
+. \" is the first token in this specification string; capture
+. \" it, so that it may subsequently be duplicated at the end
+. \" of a copy of the string, so completing the correspondingly
+. \" generated substitution group specification.
+. \"
+. ds \\$0.delimiter "\\$1\"
+. substring \\$0.delimiter 0 0
 . \}
-. shift \" try matching the next specified escape, if any.
+. \" Repeat for the next <reject-list> argument, if any.
+. \"
+. shift
+. \}
+.\" When conversion of <reject-list> arguments is finished, we have no
+.\" further use for the argument length check...
+.\"
+. rr \\$0.arglen
+.
+.\" ...but we do still need to properly terminate the last of any
+.\" substitution group specifications, which were generated, or if
+.\" none were, then define an empty set of substitution groups.
+.\"
+. ie d \\$0.arglist .as \\$0.arglist "\\*[\\$0.delimiter]"
+. el .ds \\$0.arglist "\"
+.
+.\" Ultimately, hand off the filter activity to the designated
+.\" sub-handler, and clean up the local namespace.
+.\"
+. \\$0.execute \\*[\\$0.arglist] \\*[sanitize:scan.subst]
+. rm \\$0.arglist \\$0.delimiter
+..
+.de sanitize:scan.filter.execute
+.\" Usage (internal): .sanitize:scan.filter.execute <subst-group> ...
+.\"
+.\" Sub-handler, called by sanitize:scan.filter, to complete matching
+.\" of the current sanitize:scan.char, in the context of all specified
+.\" substitution group arguments.
+.\"
+.\" Initially assuming that no filter specification is applicable...
+.\"
+. nr \\$0.matched 0
+.
+.\" ...check each given filter specification in turn, until we either
+.\" identify one which matches sanitize:scan.char, or we exhaust ALL
+.\" of those given, without finding any match.
+.\"
+. while \\n(.$ \{\
+. \" For each given filter specification, begin by performing a
+. \" rudimentary check, to ensure that it comprises no fewer than
+. \" the minimum number of tokens required for validity.
+. \"
+. length \\$0.arglen \\$1
+. if \\n[\\$0.arglen]>3 \{\
+. \" Any valid substitution group specification MUST comprise
+. \" EXACTLY THREE identical delimiter tokens, and AT LEAST ONE
+. \" additional token, to be matched as a substitution candidate;
+. \" when this minimum length requirement is satisfied, capture
+. \" the delimiter, (which is the first token specified), for
+. \" subsequent matching...
+. \"
+. ds \\$0.delimiter "\\$1\"
+. substring \\$0.delimiter 0 0
+.
+. \" ...and remove it from the specification, the length of which
+. \" must consequently be reduced by one, prior to initiation of a
+. \" progressive scan for a substitutable token match, (which may
+. \" be identified by a phase ZERO invocation of the associated
+. \" evaluation sub-handler macro).
+. \"
+. nr \\$0.arglen -1
+. ds \\$0.specification "\\$1\"
+. \\$0.eval \\$0 0
+.
+. \" Regardless of the resulting phase ZERO evaluation state, a
+. \" further phase ONE evaluation is required, to completely scan
+. \" the filter specification, and subsequently check that it is
+. \" well-formed; to facilitate this, we must clear the residual
+. \" delimiter match indication, (remaining after matching the
+. \" delimiter between the token list and substitution fields),
+. \" so that we may subsequently confirm that a substitution
+. \" field is both present, and properly terminated.
+. \"
+. \" Additionally, when an applicable specification has been
+. \" identified, the original content of sanitize:scan.char MUST
+. \" be cleared, to prepare it for accumulation of the substitute
+. \" text, during the phase ONE scan.
+. \"
+. ds \\$0.try "\"
+. if \\n[\\$0.matched] .ds sanitize:scan.char "\"
+. \\$0.eval \\$0 1
+.
+. \" After completing this phase ONE evaluation, we should again
+. \" have identified a closing delimiter token, with NO residual
+. \" content in the specification string; if this isn't the case,
+. \" then the specification is malformed, so diagnose it.
+. \"
+. ds \\$0.warn "tm sanitize:warning:\\$1: \\\\$* delimiter\"
+. if !'\\*[\\$0.try]'\\*[\\$0.delimiter]' .\\*[\\$0.warn missing]
+. if \\n[\\$0.arglen] .\\*[\\$0.warn junk after closing]
+.
+. \" After evaluation of each specification, as necessary, clean
+. \" up the local namespace temporary storage.
+. \"
+. rm \\$0.specification \\$0.delimiter \\$0.try \\$0.warn
+. \}
+.
+. \" As soon as we have identified an applicable filter specification,
+. \" then that applies, to the exclusion of all others, so there is no
+. \" need to consider any more...
+. \"
+. ie \\n[\\$0.matched] .shift \\n(.$
+.
+. \" ...but until such a filter has been identified, simply discard the
+. \" most recently evaluated, and proceed to evaluate the next given,
+. \" if any remain.
+. \"
+. el .shift
+. \}
+.
+.\" When ALL necessary specifications have been evaluated, we may also
+.\" clean up the local namespace temporary counters, and flags.
+.\"
+. rr \\$0.arglen \\$0.matched
+.
+.\" Regardless of whether a substitution has been performed, (even if it
+.\" has been derived from a malformed specification), or not, whatever
+.\" content remains in sanitize:scan.char must be appended to the result
+.\" string, which is to be returned by the sanitize macro.
+.\"
+. as sanitize:result "\\*[sanitize:scan.char]\"
+..
+.de sanitize:scan.filter.execute.eval
+.\" Usage (internal): .sanitize:scan.filter.execute.eval <caller> <phase>
+.\"
+.\" Internal sub-handler for parsing of substitution filter specifications;
+.\" the <caller> argument specifies the name of the calling macro, (which is
+.\" nominally ALWAYS sanitize:scan.filter.execute), while the <phase> value
+.\" is ZERO when scanning the initial token list field, within each filter
+.\" specification, and ONE when parsing the substitution field.
+.\"
+.\" On entry, the caller's ".arglen" property specifies the number of tokens,
+.\" within the active filter specification, which remain to be parsed; the
+.\" caller has already evaluated the first of these, while assigning it to
+.\" its ".delimiter" property...
+.\"
+. while \\n[\\$1.arglen] \{\
+. \"
+. \" ...so, immediately discard it...
+. \"
+. nr \\$1.arglen -1
+. substring \\$1.specification 1
+. \"
+. \" ...and move on, capturing the next available token for evaluation...
+. \"
+. ds \\$1.try "\\*[\\$1.specification]\"
+. substring \\$1.try 0 0
+. \"
+. \" ...terminating this evaluation phase, as soon as a token match for
+. \" the caller's ".delimiter" property is encountered.
+. \"
+. if '\\*[\\$1.try]'\\*[\\$1.delimiter]' .return
+.
+. \" If we're still here, then we have a new non-delimiter token which
+. \" is to be evaluated; if this is in phase ONE, and the current filter
+. \" has been activated by a prior token match in phase ZERO, then this
+. \" token must be appended to the current sanitize:scan.char...
+. \"
+. ie \\$2 .if \\n[\\$1.matched] .as sanitize:scan.char "\\*[\\$1.try]\"
+.
+. \" ...otherwise, while in evaluation phase ZERO, we check for a token
+. \" match to the current sanitize:scan.char, setting the active state
+. \" for the current filter, on finding such a match, and then continue
+. \" to scan ahead, until a delimiter match terminates the scan, or (in
+. \" the case of a malformed filter specification), no tokens remain to
+. \" be evaluated.
+. \"
+. el .if '\\*[\\$1.try]'\\*[sanitize:scan.char]' .nr \\$1.matched 1
 . \}
 ..
 .de sanitize:skip-(
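The two-phase scan performed by sanitize:scan.filter.execute.eval can also be modelled in Python. This is a simplified, hypothetical sketch: the names `next_token` and `apply_group` are illustrative, and where the macro makes two separate eval calls (phase ZERO over the token-list field, phase ONE over the substitution field) with the caller diagnosing malformed specifications afterwards, this model folds both phases into one pass over the specification string.

```python
# Hypothetical model of one substitution group evaluation, as performed
# by sanitize:scan.filter.execute and its .eval sub-handler; not part
# of sanitize.tmac.  Tokens are modelled as strings: a backslash escape
# is a two-character token, anything else a single character.

def next_token(s: str):
    """Pop one token off the front of s; return (token, remainder)."""
    n = 2 if s[:1] == "\\" and len(s) > 1 else 1
    return s[:n], s[n:]

def apply_group(spec: str, scan_char: str):
    """Evaluate one delimited group, e.g. "'\\-'-'", against scan_char.

    Returns (matched, replacement-or-None, warnings): phase 0 scans the
    token-list field for scan_char; phase 1 accumulates the substitute
    field when a match was found; missing or trailing delimiters are
    diagnosed, mirroring the macro's "missing"/"junk after closing"
    warnings.
    """
    warnings = []
    delim, rest = spec[0], spec[1:]
    matched, phase, subst, closed = False, 0, "", False
    while rest:
        tok, rest = next_token(rest)
        if tok == delim:
            if phase == 0:
                phase = 1          # delimiter ends the token-list field
                continue
            closed = True          # delimiter ends the substitute field
            break
        if phase == 0:
            matched = matched or tok == scan_char
        elif matched:
            subst += tok           # accumulate the substitute text
    if not closed:
        warnings.append(f"{spec}: missing delimiter")
    if rest:
        warnings.append(f"{spec}: junk after closing delimiter")
    return matched, (subst if matched else None), warnings
```

Under this model, the default group `'\-'-'` maps a `\-` token to "-", `'\ \~' '` maps `\ ` or `\~` to an ASCII SP, and a truncated specification such as `'\-'-` is reported as missing its closing delimiter.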