Merge pull request #1580 from b4n/name-escape

Escape leading spaces in tag names instead of stripping them
universal-ctags · Nov 28, 2017 · 248cffc · 248cffc
2 parents 157918c + 7195e6a
commit 248cffc
Show file tree

Hide file tree

Showing 14 changed files with 86 additions and 62 deletions.
diff --git a/Tmain/broken-tagname.d/stderr-expected.txt b/Tmain/broken-tagname.d/stderr-expected.txt
@@ -3,8 +3,11 @@ Reading command line arguments
 CTagsSelfTest requires a memory stream for input
 OPENING input.cst as CTagsSelfTest language file [new,required]
 Initialize parser: CTagsSelfTest
-Unexpected character (0 < *c && *c < 0x20) included in a tagEntryInfo: one
+Unexpected character 0x0a included in a tagEntryInfo: one
 ofbroken	name
 File: input.cst, Line: 1, Lang: CTagsSelfTest, Kind: b
 Escape the character
+Unexpected character 0x09 included in a tagEntryInfo: \Broken	Context
+File: input.cst, Line: 1, Lang: CTagsSelfTest, Kind: b
+Escape the character
 sorting tag file

diff --git a/Units/parser-html.r/whitespace-prefixed.html.d/expected.tags b/Units/parser-html.r/whitespace-prefixed.html.d/expected.tags
diff --git a/Units/parser-html.r/whitespace-prefixed.html.d/input.html b/Units/parser-html.r/whitespace-prefixed.html.d/input.html
diff --git a/Units/parser-html.r/whitespaces.html.d/args.ctags b/Units/parser-html.r/whitespaces.html.d/args.ctags
@@ -0,0 +1 @@
+--sort=no
diff --git a/Units/parser-html.r/whitespaces.html.d/expected.tags b/Units/parser-html.r/whitespaces.html.d/expected.tags
@@ -0,0 +1,3 @@
+heading1	input.html	/^<\/h1>$/;"	h
+heading2	input.html	/^<h1>  heading2  <\/h1>$/;"	h
+heading 3	input.html	/^<h1>heading  3  <\/h1>$/;"	h
diff --git a/Units/parser-html.r/whitespaces.html.d/input.html b/Units/parser-html.r/whitespaces.html.d/input.html
@@ -0,0 +1,5 @@
+<h1>
+    heading1
+</h1>
+<h1>  heading2  </h1>
+<h1>heading  3  </h1>
diff --git a/Units/parser-javascript.r/js-broken-template.d/expected.tags b/Units/parser-javascript.r/js-broken-template.d/expected.tags
@@ -0,0 +1 @@
+\n	input.js	/^let`$/;"	v
diff --git a/Units/parser-javascript.r/js-odd-method-names.d/args.ctags b/Units/parser-javascript.r/js-odd-method-names.d/args.ctags
@@ -0,0 +1 @@
+--sort=no
diff --git a/Units/parser-javascript.r/js-odd-method-names.d/expected.tags b/Units/parser-javascript.r/js-odd-method-names.d/expected.tags
@@ -0,0 +1,12 @@
+\x21hello	input.js	/^  '!hello': function(){},$/;"	m	class:object
+\x20hello	input.js	/^  ' hello': function(){},$/;"	m	class:object
+<hello	input.js	/^  '<hello': function(){},$/;"	m	class:object
+>hello	input.js	/^  '>hello': function(){},$/;"	m	class:object
+\thello	input.js	/^  '	hello': function(){},$/;"	m	class:object
+\\hello	input.js	/^  '\\\\hello': function(){},$/;"	m	class:object
+;"hello	input.js	/^  ';"hello': function(){},$/;"	m	class:object
+"hello	input.js	/^  '"hello': function(){},$/;"	m	class:object
+'hello	input.js	/^  "'hello": function(){},$/;"	m	class:object
+hello!	input.js	/^  'hello!': function(){},$/;"	m	class:object
+hello 	input.js	/^  'hello ': function(){},$/;"	m	class:object
+object	input.js	/^var object = {$/;"	c
diff --git a/Units/parser-javascript.r/js-odd-method-names.d/input.js b/Units/parser-javascript.r/js-odd-method-names.d/input.js
@@ -0,0 +1,13 @@
+var object = {
+  '!hello': function(){},
+  ' hello': function(){},
+  '<hello': function(){},
+  '>hello': function(){},
+  '	hello': function(){},
+  '\\hello': function(){},
+  ';"hello': function(){},
+  '"hello': function(){},
+  "'hello": function(){},
+  'hello!': function(){},
+  'hello ': function(){},
+};
diff --git a/docs/format.rst b/docs/format.rst
@@ -269,9 +269,14 @@ A tagfield has a name, a colon, and a value: "name:value".
   must be doubled!
 
   EXCEPTION: Universal ctags introduces more conversion rules.
-  The characters in range 0 to 0x20 and 0x7F is converted
-  to \x prefixed hexadecimal number if the characters are not handled
-  in the abouve "value" rules.
+
+  - When a value contains a "\\a", this stands for a <BEL> (0x07).
+  - When a value contains a "\\b", this stands for a <BS> (0x08).
+  - When a value contains a "\\v", this stands for a <VT> (0x0b).
+  - When a value contains a "\\f", this stands for a <FF> (0x0c).
+  - The characters in the range 0x01 to 0x1F inclusive, 0x7F, and a leading
+    space (0x20) or '!' (0x21) are converted to \x-prefixed hexadecimal
+    numbers if they are not handled by the above "value" rules.
 
 Proposed tagfield names:
 
@@ -468,9 +473,10 @@ Exceptions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 #. {tagname} in tags file generated by Universal ctags may contain
-   spaces. Parsers for documents like Tex and reStructuredText need
-   this exceptions. See {tagname} of Proposal section for more detail
-   about the conversion.
+   spaces and several escape sequences. Parsers for documents like TeX and
+   reStructuredText, or liberal languages such as JavaScript, need these
+   exceptions. See {tagname} of Proposal section for more detail about the
+   conversion.
 
 .. _compat-output:
 

diff --git a/main/entry.c b/main/entry.c
@@ -1294,25 +1294,8 @@ static void makeTagEntriesForSubwords (tagEntryInfo *const subtag)
 	stringListDelete (list);
 }
 
-static char *trimPrefixedWhitespaces (const char *name)
+extern int makeTagEntry (const tagEntryInfo *const tag)
 {
-	const char *start;
-
-	for (start = name; isspace(*start); start++)
-		;
-
-	if (start != name)
-		return eStrdup (start);
-
-	return NULL;
-}
-
-extern int makeTagEntry (const tagEntryInfo *const tag_const)
-{
-	char *trimmed_name;
-	const tagEntryInfo *tag = tag_const;
-	tagEntryInfo tag_backingstore;
-
 	int r = CORK_NIL;
 	Assert (tag->name != NULL);
 
@@ -1329,14 +1312,6 @@ extern int makeTagEntry (const tagEntryInfo *const tag_const)
 			return CORK_NIL;
 	}
 
-	trimmed_name = trimPrefixedWhitespaces (tag->name);
-	if (trimmed_name)
-	{
-		tag_backingstore = *tag_const;
-		tag = &tag_backingstore;
-		tag_backingstore.name = trimmed_name;
-	}
-
 	if (tag->name [0] == '\0' && (!tag->placeholder))
 	{
 		if (!doesInputLanguageAllowNullTag())
@@ -1356,8 +1331,6 @@ extern int makeTagEntry (const tagEntryInfo *const tag_const)
 		makeTagEntriesForSubwords (&subtag);
 	}
 out:
-	if (trimmed_name)
-		eFree (trimmed_name);
 	return r;
 }
 

diff --git a/main/field.c b/main/field.c
@@ -382,42 +382,52 @@ static const char *renderEscapedString (const char *s,
 	return vStringValue (b);
 }
 
-static const char *renderEscapedName (const char* s,
+static const char *renderEscapedName (const bool isTagName,
+				      const char* s,
 				      const tagEntryInfo *const tag,
 				      vString* b)
 {
-	const char* base = s;
+	int unexpected_byte = 0;
 
-	for (; *s; s++)
+	if (isTagName && (*s == ' ' || *s == '!'))
 	{
-		int c = *s;
-		if ((c > 0x00 && c <= 0x1F) || c == 0x7F)
+		/* Don't allow a leading space or exclamation mark as it conflicts with
+		 * pseudo-tags when sorting.  Anything with a lower byte value is
+		 * escaped by renderEscapedString() already. */
+		unexpected_byte = *s;
+		switch (*s)
 		{
-			const kindDefinition *kdef = getTagKind (tag);
-			verbose ("Unexpected character (0 < *c && *c < 0x20) included in a tagEntryInfo: %s\n", base);
-			verbose ("File: %s, Line: %lu, Lang: %s, Kind: %c\n",
-				 tag->inputFileName, tag->lineNumber, getLanguageName(tag->langType), kdef->letter);
-			verbose ("Escape the character\n");
-			break;
+			case ' ': vStringCatS (b, "\\x20"); s++; break;
+			case '!': vStringCatS (b, "\\x21"); s++; break;
+			default: AssertNotReached();
 		}
-		else if (c == '\\')
-			break;
-		else
-			continue;
 	}
+	else
+	{
+		/* Find the first byte needing escaping for the warning message */
+		const char *p = s;
 
-	if (*s == '\0')
-		return base;
+		while (*p > 0x1F && *p != 0x7F)
+			p++;
+		unexpected_byte = *p;
+	}
 
-	vStringNCatS (b, base, s - base);
+	if (unexpected_byte)
+	{
+		const kindDefinition *kdef = getTagKind (tag);
+		verbose ("Unexpected character %#04x included in a tagEntryInfo: %s\n", unexpected_byte, s);
+		verbose ("File: %s, Line: %lu, Lang: %s, Kind: %c\n",
+			 tag->inputFileName, tag->lineNumber, getLanguageName(tag->langType), kdef->letter);
+		verbose ("Escape the character\n");
+	}
 
 	return renderEscapedString (s, tag, b);
 }
 
 static const char *renderFieldName (const tagEntryInfo *const tag, const char *value CTAGS_ATTR_UNUSED, vString* b,
 									bool *rejected CTAGS_ATTR_UNUSED)
 {
-	return renderEscapedName (tag->name, tag, b);
+	return renderEscapedName (true, tag->name, tag, b);
 }
 
 static const char *renderFieldNameNoEscape (const tagEntryInfo *const tag, const char *value CTAGS_ATTR_UNUSED, vString* b,
@@ -471,7 +481,7 @@ static const char *renderFieldScope (const tagEntryInfo *const tag, const char *
 	const char* scope;
 
 	getTagScopeInformation ((tagEntryInfo *const)tag, NULL, &scope);
-	return scope? renderEscapedName (scope, tag, b): NULL;
+	return scope? renderEscapedName (false, scope, tag, b): NULL;
 }
 
 static const char *renderFieldScopeNoEscape (const tagEntryInfo *const tag, const char *value CTAGS_ATTR_UNUSED, vString* b,
@@ -499,7 +509,7 @@ static const char *renderFieldInherits (const tagEntryInfo *const tag, const cha
 static const char *renderFieldTyperef (const tagEntryInfo *const tag, const char *value CTAGS_ATTR_UNUSED, vString* b,
 									   bool *rejected CTAGS_ATTR_UNUSED)
 {
-	return renderEscapedName (WITH_DEFUALT_VALUE (tag->extensionFields.typeRef [1]), tag, b);
+	return renderEscapedName (false, WITH_DEFUALT_VALUE (tag->extensionFields.typeRef [1]), tag, b);
 }
 
 

diff --git a/parsers/html.c b/parsers/html.c
@@ -178,8 +178,7 @@ static void readTokenText (tokenInfo *const token, bool collectText)
 					c = ' ';
 				if (c != ' ' || lastC != ' ')
 				{
-					if (collectText)
-						vStringPut (token->string, c);
+					vStringPut (token->string, c);
 					lastC = c;
 				}
 			}
@@ -460,6 +459,7 @@ static void readTag (tokenInfo *token, vString *text, int depth)
 						else
 							headingKind = K_HEADING3;
 
+						vStringStripLeading (text);
 						vStringStripTrailing (text);
 						makeSimpleTag (text, headingKind);
 					}