diff --git a/doc/flex.texi b/doc/flex.texi index 9087622e6..3059715c1 100644 --- a/doc/flex.texi +++ b/doc/flex.texi @@ -655,6 +655,34 @@ ruleD ECHO; @end verbatim @end example +Flex rejects comments that include an @code{*}, followed by either @code{\} or +@code{??/}, a newline (optionally preceded by whitespace), and finally an +@code{/} (at the start of the next line). In C, this ends a comment, as the +characters between @code{*} and @code{/} are considered to be an escaped +newline, and escaped newlines are removed before comments are processed. + +Therefore, the following comments are invalid: + +@example +@verbatim +%{ +/* code block *\ +/ +*/ +%} + +/* Definitions Section *??/ +/ + +%% + /* Rules Section *\ +/ +ruleD ECHO; +} +%% +@end verbatim +@end example + @node Patterns, Matching, Format, Top @chapter Patterns @@ -1207,7 +1235,7 @@ Actions can include arbitrary C code, including @code{return} statements to return a value to whatever routine called @code{yylex()}. Each time @code{yylex()} is called it continues processing tokens from where it last left off until it either reaches the end of the file or executes a -return. +return. Flex does impose some minor restrictions on this code. Specifically: @cindex yytext, modification of Actions are free to modify @code{yytext} except for lengthening it @@ -4534,6 +4562,32 @@ option. @code{flex} is fully compatible with @code{lex} with the following exceptions: @itemize +@item +Flex rejects block comments that contain C escaped newlines in their start +and/or end sequences. Earlier versions of Flex would be confused by them, and +most syntax highlighters are confused too. + +@item +Flex rejects code that contains trigraphs, if trigraph expansion could affect +the meaning of the code. Flex does not know whether your C or C++ compiler +processes trigraphs, so it cannot scan your code properly. Trigraphs are +virtually never used, so this problem should be rare. 
+ +@item +Flex understands C++11 raw string literals. Since Flex does not know if you +will compile your C code as C++, Flex may reject valid C input in rare cases. +These cases can be fixed by ensuring that a double-quoted string is separated +by whitespace from any adjacent identifiers. + +@item +Flex understands line comments, as specified by C++ and C99. If your +code has line comments, but your C compiler does not process them, you will get +an error from the C compiler. + +@item +Flex rejects line comments that contain an escaped newline. This is mostly a +source of bugs, and is hardly ever intentional. + @item The undocumented @code{lex} scanner internal variable @code{yylineno} is not supported unless @samp{-l} or @code{%option yylineno} is used. @@ -4698,8 +4752,11 @@ respectively. If the version of @code{flex} being used is a beta version, then the symbol @code{FLEX_BETA} is defined. @item -The symbols @samp{[[} and @samp{]]} in the code sections of the input -may conflict with the m4 delimiters. @xref{M4 Dependency}. +In past versions of Flex, the symbols @samp{[[} and @samp{]]} in the code +sections of the input could conflict with the M4 delimiters. +@xref{M4 Dependency}. This is now fixed, and you can use @samp{[[} and +@samp{]]} freely in your code. If you get any errors from M4 (such as +@code{end of file in string}), please report them as bugs. @end itemize @@ -8389,25 +8446,8 @@ future revisions of flex. It is not part of the public API of flex. Do not depen must be installed wherever flex is installed. @code{flex} invokes @samp{m4}, found by searching the directories in the @code{PATH} environment variable. Any code you place in section 1 or in the -actions will be sent through m4. Please follow these rules to protect your -code from unwanted @code{m4} processing. - -@itemize - -@item Do not use symbols that begin with, @samp{m4_}, such as, @samp{m4_define}, -or @samp{m4_include}, since those are reserved for @code{m4} macro names. 
If for -some reason you need m4_ as a prefix, use a preprocessor #define to get your -symbol past m4 unmangled. - -@item Do not use the strings @samp{[[} or @samp{]]} anywhere in your code. The -former is not valid in C, except within comments and strings, but the latter is valid in -code such as @code{x[y[z]]}. The solution is simple. To get the literal string -@code{"]]"}, use @code{"]""]"}. To get the array notation @code{x[y[z]]}, -use @code{x[y[z] ]}. Flex will attempt to detect these sequences in user code, and -escape them. However, it's best to avoid this complexity where possible, by -removing such sequences from your code. - -@end itemize +actions will be sent through m4. Flex quotes the code that you have written, +and escapes it as needed, so this does not impose any restrictions on your code. @code{m4} is only required at the time you run @code{flex}. The generated scanner is ordinary C or C++, and does @emph{not} require @code{m4}. diff --git a/src/scan.l b/src/scan.l index f4b44b825..d08e8b652 100644 --- a/src/scan.l +++ b/src/scan.l @@ -43,85 +43,120 @@ extern bool tablesverify, tablesext; extern int trlcontxt; /* Set in parse.y for each rule. 
*/ extern const char *escaped_qstart, *escaped_qend; +static bool write_to_buf; #define M4QSTART "[""[" #define M4QEND "]""]" #define ESCAPED_QSTART "[" M4QEND M4QSTART "[" M4QEND M4QSTART #define ESCAPED_QEND M4QEND "]" M4QSTART M4QEND "]" M4QSTART -#define ACTION_ECHO add_action( yytext ) -#define ACTION_IFDEF(def, should_define) \ - { \ - if ( should_define ) \ - action_define( def, 1 ); \ - } - -#define ACTION_ECHO_QSTART add_action (ESCAPED_QSTART) -#define ACTION_ECHO_QEND add_action (ESCAPED_QEND) +#define ACTION_ECHO \ + do { \ + size_t i = 0, j = (yyleng); \ + const char *buf = (yytext); \ + for (i = 0; i < j; ++i) { \ + if ('\0' == buf[i]) { \ + synerr(_("Fatal error: NUL character in input file")); \ + /* M4 has undefined behavior if it gets a NUL character */ \ + /* Also, add_action() assumes NUL termination. */ \ + FLEX_EXIT(EXIT_FAILURE); \ + } else if ('\n' == buf[i]) { \ + ++linenum; \ + } \ + } \ + if (write_to_buf) \ + buf_strnappend(&top_buf, buf, j); \ + else \ + add_action(buf); \ + } while (0) + +#define ACTION_IFDEF(def, should_define) \ + do \ + if (should_define) \ + action_define(def, 1); \ + while (0) #define ACTION_M4_IFDEF(def, should_define) \ - do{ \ - if ( should_define ) \ - buf_m4_define( &m4defs_buf, def, NULL);\ + do \ + if (should_define) \ + buf_m4_define(&m4defs_buf, def, NULL); \ else \ - buf_m4_undefine( &m4defs_buf, def);\ - } while(0) - -#define MARK_END_OF_PROLOG mark_prolog(); + buf_m4_undefine(&m4defs_buf, def); \ + while (0) + +/* Compare two buffers for equality */ +static bool flex_mem_equal(void *ptr1, size_t size1, void *ptr2, size_t size2) { + if (size1 != size2) + return false; + if (!size1) + return true; + assert(ptr1); + assert(ptr2); + return !memcmp(ptr1, ptr2, size1); +} -#define YY_DECL \ - int flexscan(void) +#define YY_DECL int flexscan(void) -#define RETURNCHAR \ - yylval = (unsigned char) yytext[0]; \ - return CHAR; +#define RETURNCHAR \ + do { \ + yylval = (unsigned char)yytext[0]; \ + return CHAR; \ 
+ } while (0) #define RETURNNAME \ - if(yyleng < MAXLINE) \ - { \ - strncpy( nmstr, yytext, sizeof(nmstr) ); \ + do \ + if (yyleng < MAXLINE) { \ + strncpy(nmstr, yytext, sizeof(nmstr)); \ return NAME; \ - } \ - else \ - do { \ + } else { \ synerr(_("Input line too long\n")); \ FLEX_EXIT(EXIT_FAILURE); \ - } while (0) + } \ + while (0) #define PUT_BACK_STRING(str, start) \ - { size_t i = strlen( str ); \ - while ( i > start ) \ - unput((str)[--i]); \ - } - -#define CHECK_REJECT(str) \ - if ( all_upper( str ) ) \ - reject = true; - -#define CHECK_YYMORE(str) \ - if ( all_lower( str ) ) \ - yymore_used = true; - -#define YY_USER_INIT \ - if ( getenv("POSIXLY_CORRECT") ) \ - posix_compat = true; - -#define START_CODEBLOCK(x) do { \ - /* Emit the needed line directive... */\ + do { \ + const char *_str = str; \ + size_t i = strlen(_str); \ + while (i > start) \ + unput((_str)[--i]); \ + } while (0) + +#define CHECK_REJECT(str) \ + do /* sticky: a later non-matching token must not clear the flag */ \ + if (all_upper(str)) reject = true; \ + while (0) + +#define CHECK_YYMORE(str) \ + do /* sticky: a later non-matching token must not clear the flag */ \ + if (all_lower(str)) yymore_used = true; \ + while (0) + +#define YY_USER_INIT \ + do /* only ever turn posix_compat on; never clear a value set earlier */ \ + if (getenv("POSIXLY_CORRECT")) posix_compat = true; \ + while (0) + +#define START_CODEBLOCK(x) \ + do { \ + /* Emit the needed line directive... 
*/ \ if (indented_code == false) { \ linenum++; \ line_directive_out(NULL, 1); \ } \ add_action(M4QSTART); \ yy_push_state(CODEBLOCK); \ - if ((indented_code = x)) ACTION_ECHO; \ -} while(0) + if ((indented_code = x)) \ + ACTION_ECHO; \ + } while (0) -#define END_CODEBLOCK do { \ - yy_pop_state();\ +#define END_CODEBLOCK \ + do { \ + yy_pop_state(); \ add_action(M4QEND); \ - if (!indented_code) line_directive_out(NULL, 0);\ -} while (0) + if (!indented_code) \ + line_directive_out(NULL, 0); \ + } while (0) %} @@ -137,17 +172,29 @@ extern const char *escaped_qstart, *escaped_qend; %x COMMENT_DISCARD CODE_COMMENT %x SECT3_NOESCAPE %x CHARACTER_CONSTANT +%x LINE_COMMENT +%x RAW_STRING + + +WS ([[:blank:]]+) +OPTWS ([[:blank:]]*) +NOT_WS ([^[:blank:]\r\n]) -WS [[:blank:]]+ -OPTWS [[:blank:]]* -NOT_WS [^[:blank:]\r\n] +NL (\r?\n) -NL \r?\n +INVALID_ESCAPED_NEWLINE (("\\"|"??/")[\ \t\v\f]*(\r|\n|\r\n)) NAME ([[:alpha:]_][[:alnum:]_-]*) -NOT_NAME [^[:alpha:]_*\n]+ +NOT_NAME ([^[:alpha:]_*\n]+) +IDENT_START_CHAR ([A-Za-z_\x80-\xBF\xC2-\xF4]) +IDENT_CONT_CHAR ([A-Za-z0-9_\x80-\xBF\xC2-\xF4]) +GENERAL_NAME_START ({IDENT_START_CHAR}{INVALID_ESCAPED_NEWLINE}*) +GENERAL_NAME_CONT ({IDENT_CONT_CHAR}{INVALID_ESCAPED_NEWLINE}*) +GENERAL_NAME ({GENERAL_NAME_START}{GENERAL_NAME_CONT}*) +RAW_STRING_START ("R"{INVALID_ESCAPED_NEWLINE}*\"{INVALID_ESCAPED_NEWLINE}*{GENERAL_NAME}"(") +RAW_STRING_END (")"{IDENT_START_CHAR}{IDENT_CONT_CHAR}*\") -SCNAME {NAME} +SCNAME ({NAME}) ESCSEQ (\\([^\n]|[0-7]{1,3}|x[[:xdigit:]]{1,2})) @@ -155,10 +202,10 @@ FIRST_CCL_CHAR ([^\\\n]|{ESCSEQ}) CCL_CHAR ([^\\\n\]]|{ESCSEQ}) CCL_EXPR ("[:"^?[[:alpha:]]+":]") -LEXOPT [aceknopr] +LEXOPT ([aceknopr]) -M4QSTART "[""[" -M4QEND "]""]" +M4QSTART ("[""[") +M4QEND ("]""]") %% static int bracelevel, didadef, indented_code; @@ -170,7 +217,7 @@ M4QEND "]""]" char nmdef[MAXLINE]; -{ + { ^{WS} START_CODEBLOCK(true); ^"/*" add_action("/*[""["); yy_push_state( COMMENT ); ^#{OPTWS}line{WS} yy_push_state( LINEDIR ); @@ 
-182,6 +229,7 @@ M4QEND "]""]" ++linenum; buf_linedir( &top_buf, infilename?infilename:"", linenum); brace_depth = 1; + write_to_buf = true; yy_push_state(CODEBLOCK_MATCH_BRACE); } @@ -226,24 +274,24 @@ M4QEND "]""]" {SCNAME} RETURNNAME; ^{OPTWS}{NL} ++linenum; /* allows blank lines in section 1 */ - {OPTWS}{NL} ACTION_ECHO; ++linenum; /* maybe end of comment line */ + {OPTWS}{NL} ACTION_ECHO; /* maybe end of comment line */ } -{ /* */ - [^\[\]\*\n]* ACTION_ECHO; - . ACTION_ECHO; +"*/" add_action("*/" M4QEND); yy_pop_state(); +"*/" add_action("*/"); yy_pop_state(); - {NL} ++linenum; ACTION_ECHO; -} -{ - "*/" add_action("*/]""]"); yy_pop_state(); -} -{ - "*/" ACTION_ECHO; yy_pop_state(); + { + ([^\[\]\*]|"["[^\[]|"]"[^]])+ ACTION_ECHO; + "*"{INVALID_ESCAPED_NEWLINE}+"/" { + synerr(_("Escaped newline in comment end sequence\n")); + ACTION_ECHO; + if (COMMENT == YYSTATE) + add_action(M4QEND); + } } -{ + { /* This is the same as COMMENT, but is discarded rather than output. */ "*/" yy_pop_state(); "*" ; @@ -268,25 +316,32 @@ M4QEND "]""]" } . /* ignore spurious characters */ } -{ - {M4QSTART} ACTION_ECHO_QSTART; - {M4QEND} ACTION_ECHO_QEND; +{ /* */ + ({M4QSTART}) add_action(ESCAPED_QSTART); + ({M4QEND}) add_action(ESCAPED_QEND); +} + +{ /* */ + (\\)?"??"[=/\'()!<>-] { + synerr(_("Trigraphs are not allowed in code processed by Flex.")); + ACTION_ECHO; + } } -{ +{ /* */ ^"%}".*{NL} ++linenum; END_CODEBLOCK; [^\n%\[\]]* ACTION_ECHO; . ACTION_ECHO; {NL} { - ++linenum; ACTION_ECHO; if ( indented_code ) END_CODEBLOCK; } } -{ + { "}" { if( --brace_depth == 0){ + write_to_buf = false; /* TODO: Matched. 
*/ yy_pop_state(); }else @@ -303,11 +358,9 @@ M4QEND "]""]" buf_strnappend(&top_buf, yytext, yyleng); } + {M4QSTART} buf_strnappend(&top_buf, escaped_qstart, (int) strlen(escaped_qstart)); {M4QEND} buf_strnappend(&top_buf, escaped_qend, (int) strlen(escaped_qend)); - ([^{}\r\n\[\]]+)|[^{}\r\n] { - buf_strnappend(&top_buf, yytext, yyleng); - } <> { linenum = brace_start_line; @@ -507,9 +560,7 @@ M4QEND "]""]" } } - . ACTION_ECHO; - {NL} ++linenum; ACTION_ECHO; - + (.|\n) ACTION_ECHO; <> { mark_prolog(); sectnum = 0; @@ -755,7 +806,7 @@ nmstr[yyleng - 2 - end_is_ws] = '\0'; /* chop trailing brace */ if (lex_compat || posix_compat){ /* Push back the "?#" and treat it like a normal parens. */ yyless(1); - sf_push(); + sf_push(); return '('; } else @@ -826,12 +877,14 @@ nmstr[yyleng - 2 - end_is_ws] = '\0'; /* chop trailing brace */ } { + /* Yes, Flex uses trailing context. */ "^"/[^-\]\n] BEGIN(CCL); return '^'; "^"/("-"|"]") return '^'; . BEGIN(CCL); RETURNCHAR; } { + /* and here too. */ -/[^\]\n] return '-'; [^\]\n] RETURNCHAR; "]" BEGIN(SECT2); return ']'; @@ -904,23 +957,78 @@ nmstr[yyleng - 2 - end_is_ws] = '\0'; /* chop trailing brace */ return '}'; } } - - -{ - {OPTWS}"%}".* bracelevel = 0; - - "/*" ACTION_ECHO; yy_push_state( CODE_COMMENT ); - - { - "reject" { + { + "reject" { ACTION_ECHO; CHECK_REJECT(yytext); } - "yymore" { + "yymore" { ACTION_ECHO; CHECK_YYMORE(yytext); } - } +} + + { + "/"{INVALID_ESCAPED_NEWLINE}+"*" { + synerr(_("Invalid escaped newline in comment-open-sequence")); + ACTION_ECHO; + yy_push_state(CODE_COMMENT); + } + "/*" ACTION_ECHO; yy_push_state(CODE_COMMENT); + + [^[:alpha:]_{}""''/\n\[\]]+ ACTION_ECHO; + {NAME} ACTION_ECHO; + "'"([^''\\\n]|\\.)"'" ACTION_ECHO; /* character constant */ + "'" ACTION_ECHO; yy_push_state(CHARACTER_CONSTANT); + "\"" ACTION_ECHO; yy_push_state(ACTION_STRING); + "/"{INVALID_ESCAPED_NEWLINE}+"/" { + synerr(_("line comment has an escaped newline")); + ACTION_ECHO; + yy_push_state(LINE_COMMENT); + } + "//" 
ACTION_ECHO; yy_push_state(LINE_COMMENT); + {INVALID_ESCAPED_NEWLINE} ACTION_ECHO; + {RAW_STRING_START} { + if (memchr(yytext, '\n', yyleng) || memchr(yytext, '\r', yyleng)) { + /* We do not handle this case. It is doable, just awkward. + * Also, it should never happen in real code. */ + synerr(_("Fatal error: Escaped newline in raw string start")); + FLEX_EXIT(EXIT_FAILURE); + } + assert(!raw_string_buffer && "Memory leak!"); + assert(yyleng >= 3); + if ((raw_string_buffer_size = yyleng - 3)) { + raw_string_buffer = malloc(raw_string_buffer_size); + if (!raw_string_buffer) + abort(); + memcpy(raw_string_buffer, yytext + 2, raw_string_buffer_size); + } + ACTION_ECHO; + yy_push_state(RAW_STRING); + } +} + + { + {RAW_STRING_END} { + ACTION_ECHO; + assert(yyleng >= 2); + if (flex_mem_equal(yytext + 1, yyleng - 2, raw_string_buffer, + raw_string_buffer_size)) { + /* free(NULL) is harmless */ + free(raw_string_buffer); + raw_string_buffer = NULL; + raw_string_buffer_size = 0; + yy_pop_state(); + } + } + [^)]+|")" { + ACTION_ECHO; /* stop at each ')' so a longer {RAW_STRING_END} match can win */ + } +} + + { + {OPTWS}"%}".* bracelevel = 0; + . ACTION_ECHO; {NL} { @@ -936,16 +1044,24 @@ nmstr[yyleng - 2 - end_is_ws] = '\0'; /* chop trailing brace */ } } +{ + {INVALID_ESCAPED_NEWLINE} { + synerr(_("Invalid escaped newline in line comment")); + ACTION_ECHO; + /* no ++linenum here: ACTION_ECHO already counts the newline */ + } + {NL} { + /* no linenum++ here: ACTION_ECHO already counts the newline */ + ACTION_ECHO; + yy_pop_state(); + } + . 
ACTION_ECHO; +} /* Reject and YYmore() are checked for above, in PERCENT_BRACE_ACTION */ { "{" ACTION_ECHO; ++bracelevel; "}" ACTION_ECHO; --bracelevel; - [^[:alpha:]_{}""''/\n\[\]]+ ACTION_ECHO; - {NAME} ACTION_ECHO; - "'"([^''\\\n]|\\.)"'" ACTION_ECHO; /* character constant */ - "'" ACTION_ECHO; BEGIN(CHARACTER_CONSTANT); - "\"" ACTION_ECHO; BEGIN(ACTION_STRING); {NL} { ++linenum; ACTION_ECHO; @@ -962,20 +1078,36 @@ nmstr[yyleng - 2 - end_is_ws] = '\0'; /* chop trailing brace */ { [^\[\]""\\\n]+ ACTION_ECHO; - "\"" ACTION_ECHO; BEGIN(ACTION); + "\"" ACTION_ECHO; yy_pop_state(); } { [^\[\]''\\\n]+ ACTION_ECHO; - "'" ACTION_ECHO; BEGIN(ACTION); + "'" ACTION_ECHO; yy_pop_state(); } + { - (\\\n)* ACTION_ECHO; - \\(\\\n)*. ACTION_ECHO; - {NL} ++linenum; ACTION_ECHO; if (bracelevel <= 0) { BEGIN(SECT2); } else { BEGIN(ACTION); } - . ACTION_ECHO; + /* Must come before the rules regarding invalid escaped newlines, + * since those can match the same strings. */ + (\\\n)+ ACTION_ECHO; + \\(\\\n)+. ACTION_ECHO; + \\. ACTION_ECHO; + . ACTION_ECHO; + + /* We check for invalid escaped newlines, since most Flex + * implementations (including old versions of Flex) don't handle them + * properly. */ + ({INVALID_ESCAPED_NEWLINE}+|\\{INVALID_ESCAPED_NEWLINE}+.) { + synerr(_("Invalid escaped newline in action string")); + ACTION_ECHO; + } + + {NL} { + synerr(_("Newline in character constant or string literal")); + ACTION_ECHO; + } } -<> { +<> { synerr( _( "EOF encountered inside an action" ) ); yyterminate(); } @@ -995,6 +1127,8 @@ nmstr[yyleng - 2 - end_is_ws] = '\0'; /* chop trailing brace */ } { + /* We do very little processing in section 3. TODO: do the same validation + * we do elsewhere. */ {M4QSTART} fputs(escaped_qstart, yyout); {M4QEND} fputs(escaped_qend, yyout); [^\[\]]* ECHO; @@ -1004,12 +1138,14 @@ nmstr[yyleng - 2 - end_is_ws] = '\0'; /* chop trailing brace */ yyterminate(); } } + { + /* This mode is used ONLY by the test suite. 
*/ {M4QSTART} fprintf(yyout, "[""[%s]""]", escaped_qstart); {M4QEND} fprintf(yyout, "[""[%s]""]", escaped_qend); [^][]* ECHO; [][] ECHO; - <> { + <> { sectnum = 0; yyterminate(); }