VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/script/scriptlex.cpp

Last change on this file was 108281, checked in by vboxsync, 3 weeks ago

Runtime/RTScriptLex*: Implement support for optionally returning parsed comments (single and multi line) as tokens when enabled in the lexer config, bugref:10321 [scm]

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 49.3 KB
Line 
1/* $Id: scriptlex.cpp 108281 2025-02-19 09:59:01Z vboxsync $ */
2/** @file
3 * IPRT - RTScript* lexer API.
4 */
5
6/*
7 * Copyright (C) 2022-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * The contents of this file may alternatively be used under the terms
26 * of the Common Development and Distribution License Version 1.0
27 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28 * in the VirtualBox distribution, in which case the provisions of the
29 * CDDL are applicable instead of those of the GPL.
30 *
31 * You may elect to license modified versions of this file under the
32 * terms and conditions of either the GPL or the CDDL or both.
33 *
34 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35 */
36
37
38/*********************************************************************************************************************************
39* Header Files *
40*********************************************************************************************************************************/
41#define LOG_GROUP RTLOGGROUP_DEFAULT /// @todo
42#include <iprt/script.h>
43
44#include <iprt/assert.h>
45#include <iprt/ctype.h>
46#include <iprt/err.h>
47#include <iprt/file.h>
48#include <iprt/log.h>
49#include <iprt/mem.h>
50#include <iprt/string.h>
51
52
53/*********************************************************************************************************************************
54* Structures and Typedefs *
55*********************************************************************************************************************************/
56
/**
 * Internal lexer state.
 *
 * The structure is allocated in one piece by RTScriptLexCreateFromReader() with
 * the input buffer (achBuf) trailing the fixed size members.
 */
typedef struct RTSCRIPTLEXINT
{
    /** Magic. */
    uint32_t                u32Magic;
    /** Current source position (line and character). */
    RTSCRIPTPOS             Pos;
    /** Storage for the current and the next token. */
    RTSCRIPTLEXTOKEN        aToks[2];
    /** Pointer to the current token (points into aToks). */
    PRTSCRIPTLEXTOKEN       pTokCur;
    /** Pointer to the next token (points into aToks). */
    PRTSCRIPTLEXTOKEN       pTokNext;
    /** The lexer config. */
    PCRTSCRIPTLEXCFG        pCfg;
    /** The input reader callback. */
    PFNRTSCRIPTLEXRDR       pfnReader;
    /** The destructor callback, optional. */
    PFNRTSCRIPTLEXDTOR      pfnDtor;
    /** Opaque user data passed to the reader and destructor callbacks. */
    void                    *pvUser;
    /** Identifier string cache. */
    RTSTRCACHE              hStrCacheId;
    /** String literal string cache. */
    RTSTRCACHE              hStrCacheStringLit;
    /** Comment string cache, only created when comments are returned as tokens. */
    RTSTRCACHE              hStrCacheComments;
    /** Status code from the reader (also receives the status of the default
     * token producer callback). */
    int                     rcRdr;
    /** Internal error info, referenced by error tokens. */
    RTERRINFOSTATIC         ErrInfo;
    /** Lexer flags, combination of RTSCRIPT_LEX_INT_F_XXX. */
    uint32_t                fFlags;
    /** Maximum number of bytes allocated for temporary storage for literal strings. */
    size_t                  cchStrLitMax;
    /** Pointer to the string buffer for holding the literal string. */
    char                    *pszStrLit;
    /** Pointer to the current input character (NULL until the buffer was filled once). */
    const char              *pchCur;
    /** Offset handed to the reader for fetching the next chunk. */
    size_t                  offBufRead;
    /** Size of the input buffer. */
    size_t                  cchBuf;
    /** The cached part of the input, variable in size. */
    char                    achBuf[1];
} RTSCRIPTLEXINT;
/** Pointer to the internal lexer state. */
typedef RTSCRIPTLEXINT *PRTSCRIPTLEXINT;
107
108
/** Destroy the identifier string cache together with the lexer (set when the
 * creator did not ask for the cache handle). */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE            RT_BIT_32(0)
/** Destroy the string literal string cache together with the lexer (set when the
 * creator did not ask for the cache handle). */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE       RT_BIT_32(1)
/** Destroy the comments string cache together with the lexer (set when the
 * creator did not ask for the cache handle). */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_COMMENTS_FREE      RT_BIT_32(2)
/** End of stream reached, the reader must not be called again. */
#define RTSCRIPT_LEX_INT_F_EOS                          RT_BIT_32(3)
117
118
119/*********************************************************************************************************************************
120* Global Variables *
121*********************************************************************************************************************************/
122
/** Default set of white spaces (blank and horizontal tab). */
static const char *g_szWsDef = " \t";
/** Default set of newline sequences, NULL terminated.
 * Used whenever the lexer config does not supply its own set. */
static const char *g_aszNlDef[] =
{
    "\n",
    "\r\n",
    NULL
};
/** Default set of characters allowed for identifiers.
 * Also serves as the exclusion set for exact matches which could be the start
 * of an identifier. */
static const char *g_aszIdeCharSetDef = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
134
135
136/*********************************************************************************************************************************
137* Internal Functions *
138*********************************************************************************************************************************/
139
140
141/**
142 * Locates the given character in the string, consuming it if found.
143 *
144 * @returns Flag whether the character was found in the string.
145 * @param pThis The lexer state.
146 * @param ch The character to check for.
147 * @param psz The string to check.
148 */
149DECLINLINE(bool) rtScriptLexLocateChInStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz)
150{
151 while ( *psz != '\0'
152 && *psz != ch)
153 psz++;
154
155 if (*psz != '\0')
156 RTScriptLexConsumeCh(pThis);
157
158 return *psz != '\0';
159}
160
161
162/**
163 * Matches the input against the given string starting with the given character, consuming it
164 * if found.
165 *
166 * @returns Flag whether there was a match.
167 * @param pThis The lexer state.
168 * @param ch The character to check start matching.
169 * @param psz The string to match against.
170 * @param pszExclude When the string matched but the input continues
171 * with one of the characters in this string there will
172 * be no match.
173 */
174DECLINLINE(bool) rtScriptLexMatchStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz,
175 const char *pszExclude)
176{
177 bool fMatch = false;
178 if (*psz == ch)
179 {
180 unsigned offPeek = 1;
181
182 psz++;
183 while ( *psz != '\0'
184 && *psz == RTScriptLexPeekCh(pThis, offPeek))
185 {
186 offPeek++;
187 psz++;
188 }
189
190 if (*psz == '\0')
191 {
192 if (pszExclude)
193 {
194 ch = RTScriptLexPeekCh(pThis, offPeek);
195 fMatch = strchr(pszExclude, ch) == NULL;
196 }
197 else
198 fMatch = true;
199 }
200
201 if (fMatch)
202 {
203 /* Match, consume everything. */
204 while (offPeek-- > 0)
205 RTScriptLexConsumeCh(pThis);
206 }
207 }
208
209 return fMatch;
210}
211
212
213/**
214 * Tries to locate a string with the given starting character (+ peeking ahead) in the
215 * given string array (exact match) and consumes the entire substring.
216 *
217 * @returns Flag whether there was a match.
218 * @param pThis The lexer state.
219 * @param ch The character to check for.
220 * @param papsz Pointer to the string array to check for.
221 * @param pidx Where to store the index of the matching substring if found,
222 * optional.
223 */
224DECLINLINE(bool) rtScriptLexLocateSubStrInStrArrayMatchConsume(PRTSCRIPTLEXINT pThis, char ch,
225 const char **papsz, unsigned *pidx)
226{
227 unsigned int idx = 0;
228
229 while ( papsz[idx] != NULL
230 && !rtScriptLexMatchStrConsume(pThis, ch, papsz[idx], NULL))
231 idx++;
232
233 if ( papsz[idx] != NULL
234 && pidx)
235 *pidx = idx;
236
237 return papsz[idx] != NULL;
238}
239
240
241/**
242 * Tries to get an exact match starting with the given character, consuming it when found.
243 *
244 * @returns Flag whether there was a match.
245 * @param pThis The lexer state.
246 * @param ch The character to check for.
247 * @param ppMatch Where to store the exact match on success.
248 */
249DECLINLINE(bool) rtScriptLexLocateExactMatchConsume(PRTSCRIPTLEXINT pThis, char ch, PCRTSCRIPTLEXTOKMATCH *ppMatch)
250{
251 PCRTSCRIPTLEXTOKMATCH pTokMatch = pThis->pCfg->paTokMatches;
252
253 if (pTokMatch)
254 {
255 while ( pTokMatch->pszMatch != NULL
256 && !rtScriptLexMatchStrConsume(pThis, ch, pTokMatch->pszMatch,
257 pTokMatch->fMaybeIdentifier
258 ? g_aszIdeCharSetDef
259 : NULL))
260 pTokMatch++;
261
262 if (pTokMatch->pszMatch != NULL)
263 {
264 *ppMatch = pTokMatch;
265 return true;
266 }
267 }
268
269 return false;
270}
271
272
273DECLINLINE(bool) rtScriptLexIsNewlineConsumeEx(PRTSCRIPTLEXINT pThis, char ch, unsigned *pidx)
274{
275 const char **papszNl = pThis->pCfg->papszNewline ? pThis->pCfg->papszNewline : g_aszNlDef;
276
277 bool fMatched = rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszNl, pidx);
278 if (fMatched)
279 {
280 pThis->Pos.iLine++;
281 pThis->Pos.iCh = 1;
282 }
283
284 return fMatched;
285}
286
287
288DECLINLINE(bool) rtScriptLexIsNewlineConsume(PRTSCRIPTLEXINT pThis, char ch)
289{
290 return rtScriptLexIsNewlineConsumeEx(pThis, ch, NULL);
291}
292
293
294/**
295 * Checks whether the character is the beginning of a multi line comment.
296 *
297 * @returns Flag whether a comment was detected.
298 * @param hScriptLex The lexer state.
299 * @param ch The character to check for.
300 * @param pidxMatch Where to store the index of the matching substring if found,
301 * optional.
302 * @note This consumes the start of the single line comment.
303 */
304DECLINLINE(bool) rtScriptLexIsMultiLineComment(PRTSCRIPTLEXINT pThis, char ch, unsigned *pidxMatch)
305{
306 const char **papszCommentMultiStart = pThis->pCfg->papszCommentMultiStart;
307 if ( papszCommentMultiStart
308 && rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentMultiStart, pidxMatch))
309 return true;
310
311 return false;
312}
313
314
315/**
316 * Checks whether the character is the beginning of a multi line comment, skipping the whole
317 * comment if necessary.
318 *
319 * @returns Flag whether a multi line comment was detected and consumed.
320 * @param hScriptLex The lexer state.
321 * @param ch The character to check for.
322 */
323DECLINLINE(bool) rtScriptLexIsMultiLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
324{
325 unsigned idxComment = 0;
326 if (rtScriptLexIsMultiLineComment(pThis, ch, &idxComment))
327 {
328 /* Look for the matching closing lexeme in the input consuming everything along the way. */
329 const char *pszClosing = pThis->pCfg->papszCommentMultiEnd[idxComment];
330
331 for (;;)
332 {
333 char chTmp = RTScriptLexGetCh(pThis);
334
335 /* Check for new lines explicetly to advance the position information. */
336 if (rtScriptLexIsNewlineConsume(pThis, chTmp))
337 continue;
338
339 /** @todo Not quite correct when there is an end of stream before the closing lexeme.
340 * But doesn't hurt at the moment. */
341 if ( chTmp == '\0'
342 || rtScriptLexMatchStrConsume(pThis, chTmp, pszClosing, NULL))
343 break;
344
345 RTScriptLexConsumeCh(pThis);
346 }
347
348 return true;
349 }
350
351 return false;
352}
353
354
355/**
356 * Checks whether the character is the beginning of a single line comment.
357 *
358 * @returns Flag whether a comment was detected.
359 * @param hScriptLex The lexer state.
360 * @param ch The character to check for.
361 * @param pidxMatch Where to store the index of the matching substring if found,
362 * optional.
363 * @note This consumes the start of the single line comment.
364 */
365DECLINLINE(bool) rtScriptLexIsSingleLineComment(PRTSCRIPTLEXINT pThis, char ch, unsigned *pidxMatch)
366{
367 const char **papszCommentSingleStart = pThis->pCfg->papszCommentSingleStart;
368 if ( papszCommentSingleStart
369 && rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentSingleStart, pidxMatch))
370 return true;
371
372 return false;
373}
374
375
376/**
377 * Checks whether the character is the beginning of a single line comment, skipping the whole
378 * comment if necessary.
379 *
380 * @returns Flag whether a single line comment was detected and consumed.
381 * @param hScriptLex The lexer state.
382 * @param ch The character to check for.
383 */
384DECLINLINE(bool) rtScriptLexIsSingleLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
385{
386 if (rtScriptLexIsSingleLineComment(pThis, ch, NULL))
387 {
388 for (;;)
389 {
390 char chTmp = RTScriptLexGetCh(pThis);
391
392 if ( chTmp == '\0'
393 || rtScriptLexIsNewlineConsume(pThis, chTmp))
394 break;
395
396 RTScriptLexConsumeCh(pThis);
397 }
398
399 return true;
400 }
401
402 return false;
403}
404
405
406/**
407 * Fills the input buffer with source data.
408 *
409 * @returns IPRT status code.
410 * @param pThis The lexer state.
411 */
412static int rtScriptLexFillBuffer(PRTSCRIPTLEXINT pThis)
413{
414 int rc = VINF_SUCCESS;
415 size_t cchToRead = pThis->cchBuf;
416 char *pchRead = &pThis->achBuf[0];
417
418 AssertReturn(!(pThis->fFlags & RTSCRIPT_LEX_INT_F_EOS), VERR_INVALID_STATE);
419
420 /* If there is input left to process move it to the front and fill the remainder. */
421 if ( pThis->pchCur != NULL
422 && pThis->pchCur != &pThis->achBuf[pThis->cchBuf])
423 {
424 cchToRead = pThis->pchCur - &pThis->achBuf[0];
425 /* Move the rest to the front. */
426 size_t const cchLeft = pThis->cchBuf - cchToRead;
427 memmove(&pThis->achBuf[0], pThis->pchCur, cchLeft);
428 pchRead = &pThis->achBuf[0] + cchLeft;
429 }
430
431 if (cchToRead)
432 {
433 pThis->pchCur = &pThis->achBuf[0];
434
435 size_t cchRead = 0;
436 rc = pThis->pfnReader(pThis, pThis->offBufRead, pchRead, cchToRead, &cchRead, pThis->pvUser);
437 if (RT_SUCCESS(rc))
438 {
439 pThis->offBufRead += cchRead;
440 if (rc == VINF_EOF)
441 pThis->fFlags |= RTSCRIPT_LEX_INT_F_EOS;
442 if (cchRead < cchToRead)
443 memset(pchRead + cchRead, 0, cchToRead - cchRead);
444 rc = VINF_SUCCESS;
445 }
446 else
447 pThis->rcRdr = rc;
448 }
449 else
450 rc = VERR_BUFFER_OVERFLOW; /** @todo */
451
452 return rc;
453}
454
455
456/**
457 * Produce an end of stream token.
458 *
459 * @returns nothing.
460 * @param pThis The lexer state.
461 * @param pTok The token to fill.
462 */
463static void rtScriptLexProduceTokEos(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
464{
465 pTok->enmType = RTSCRIPTLEXTOKTYPE_EOS;
466 pTok->PosStart = pThis->Pos;
467 pTok->PosEnd = pThis->Pos;
468}
469
470
471RTDECL(int) RTScriptLexProduceTokError(RTSCRIPTLEX hScriptLex, PRTSCRIPTLEXTOKEN pTok,
472 int rc, const char *pszMsg, ...)
473{
474 PRTSCRIPTLEXINT pThis = hScriptLex;
475
476 va_list va;
477 va_start(va, pszMsg);
478
479 pTok->enmType = RTSCRIPTLEXTOKTYPE_ERROR;
480 pTok->PosEnd = pThis->Pos;
481 pTok->Type.Error.pErr = &pThis->ErrInfo.Core;
482
483 RTErrInfoInitStatic(&pThis->ErrInfo);
484 RTErrInfoSetV(&pThis->ErrInfo.Core, rc, pszMsg, va);
485 va_end(va);
486
487 return rc;
488}
489
490
491RTDECL(int) RTScriptLexProduceTokIde(RTSCRIPTLEX hScriptLex, PRTSCRIPTLEXTOKEN pTok, const char *pszIde, size_t cchIde)
492{
493 PRTSCRIPTLEXINT pThis = hScriptLex;
494
495 /* Insert into string cache. */
496 pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
497 pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, pszIde, cchIde);
498 if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
499 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
500
501 pTok->PosEnd = pThis->Pos;
502 return VINF_SUCCESS;
503}
504
505
506/**
507 * Creates a single line comment token.
508 *
509 * @returns Flag whether a matching rule was found.
510 * @param pThis The lexer state.
511 * @param idxComment The index into the single line comment token start array.
512 * @param pTok The token to fill.
513 */
514static void rtScriptLexProduceTokFromSingleLineComment(PRTSCRIPTLEXINT pThis, unsigned idxComment, PRTSCRIPTLEXTOKEN pTok)
515{
516 const char *pszCommentSingleStart = pThis->pCfg->papszCommentSingleStart[idxComment];
517 AssertPtr(pszCommentSingleStart);
518
519 pTok->PosStart = pThis->Pos;
520
521 /** @todo Optimize */
522 size_t cchTmp = 512;
523 char *pszTmp = (char *)RTMemAlloc(cchTmp);
524 if (pszTmp)
525 {
526 size_t cchComment = 0;
527 while (*pszCommentSingleStart != '\0')
528 pszTmp[cchComment++] = *pszCommentSingleStart++;
529
530 for (;;)
531 {
532 char chTmp = RTScriptLexGetCh(pThis);
533
534 if ( chTmp == '\0'
535 || rtScriptLexIsNewlineConsume(pThis, chTmp))
536 {
537 pszTmp[cchComment++] = '\0';
538 break;
539 }
540
541 if (cchComment == cchTmp - 1)
542 {
543 char *pszNew = (char *)RTMemRealloc(pszTmp, cchTmp + 512);
544 if (!pszNew)
545 {
546 RTMemFree(pszTmp);
547 pszTmp = NULL;
548 RTScriptLexProduceTokError(pThis, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory allocating temporary memory for a single line comment");
549 break;
550 }
551
552 cchTmp += 512;
553 pszTmp = pszNew;
554 }
555
556 pszTmp[cchComment++] = chTmp;
557 RTScriptLexConsumeCh(pThis);
558 }
559
560 if (pszTmp)
561 {
562 pTok->enmType = RTSCRIPTLEXTOKTYPE_COMMENT_SINGLE_LINE;
563 pTok->PosEnd = pThis->Pos;
564 pTok->Type.Comment.pszComment = RTStrCacheEnterN(pThis->hStrCacheId, pszTmp, cchComment);
565 pTok->Type.Comment.cchComment = cchComment;
566 if (RT_UNLIKELY(!pTok->Type.Comment.pszComment))
567 RTScriptLexProduceTokError(pThis, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting comment into comment cache");
568
569 RTMemFree(pszTmp);
570 }
571 }
572 else
573 RTScriptLexProduceTokError(pThis, pTok, VERR_NO_MEMORY, "Lexer: Out of memory allocating temporary memory for a single line comment");
574}
575
576
577/**
578 * Ensures there is enough space in the given buffer for the given amount of bytes,
579 * extending the buffer or creating an error token if this fails.
580 *
581 * @returns Flag whether there is enough space in the buffer.
582 * @param pThis The lexer state.
583 * @param ppchTmp Pointer to the pointer for the character buffer being checked.
584 * On successful return this might contain a different pointer if
585 * re-allocation was required.
586 * @param pcchTmp On input the size of the buffer in characters, on return the new
587 * size of the buffer if re-allocation was required.
588 * @param cchCur How much of the current buffer is used.
589 * @param cchAdd How many additional characters are required.
590 * @param pTok The token to fill in if re-allocating the buffer failed.
591 */
592DECLINLINE(bool) rtScriptLexEnsureTmpBufSpace(PRTSCRIPTLEXINT pThis, char **ppchTmp, size_t *pcchTmp,
593 size_t cchCur, size_t cchAdd, PRTSCRIPTLEXTOKEN pTok)
594{
595 if (RT_LIKELY(cchCur + cchAdd + 1 <= *pcchTmp)) /* Always keep room for the zero terminator. */
596 return true;
597
598 size_t cchNew = *pcchTmp + _1K;
599 char *pchNew = (char *)RTMemRealloc(*ppchTmp, cchNew);
600 if (!pchNew)
601 {
602 RTMemFree(*ppchTmp);
603 *ppchTmp = NULL;
604 RTScriptLexProduceTokError(pThis, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory allocating temporary memory for a multi line comment");
605 return false;
606 }
607
608 *ppchTmp = pchNew;
609 *pcchTmp = cchNew;
610 return true;
611}
612
613
614/**
615 * Creates a multi line comment token.
616 *
617 * @returns Flag whether a matching rule was found.
618 * @param pThis The lexer state.
619 * @param idxComment The index into the single line comment token start array.
620 * @param pTok The token to fill.
621 */
622static void rtScriptLexProduceTokFromMultiLineComment(PRTSCRIPTLEXINT pThis, unsigned idxComment, PRTSCRIPTLEXTOKEN pTok)
623{
624 const char *pszCommentMultiStart = pThis->pCfg->papszCommentMultiStart[idxComment];
625 AssertPtr(pszCommentMultiStart);
626
627 pTok->PosStart = pThis->Pos;
628
629 /** @todo Optimize */
630 size_t cchTmp = _1K;
631 char *pszTmp = (char *)RTMemAlloc(cchTmp);
632 if (pszTmp)
633 {
634 /* Look for the matching closing lexeme in the input consuming everything along the way. */
635 const char *pszClosing = pThis->pCfg->papszCommentMultiEnd[idxComment];
636
637 size_t cchComment = 0;
638 while (*pszCommentMultiStart != '\0')
639 pszTmp[cchComment++] = *pszCommentMultiStart++;
640
641 for (;;)
642 {
643 char chTmp = RTScriptLexGetCh(pThis);
644
645 /* Check for new lines explicetly to advance the position information and copy it over. */
646 unsigned idxNewLine = 0;
647 if (rtScriptLexIsNewlineConsumeEx(pThis, chTmp, &idxNewLine))
648 {
649 const char *pszNl = pThis->pCfg->papszNewline
650 ? pThis->pCfg->papszNewline[idxNewLine]
651 : g_aszNlDef[idxNewLine];
652 if (!rtScriptLexEnsureTmpBufSpace(pThis, &pszTmp, &cchTmp, cchComment,
653 strlen(pszNl), pTok))
654 break;
655
656 while (*pszNl != '\0')
657 pszTmp[cchComment++] = *pszNl++;
658 continue;
659 }
660
661 /* Check for the closing lexeme. */
662 if (rtScriptLexMatchStrConsume(pThis, chTmp, pszClosing, NULL))
663 {
664 /* Copy over the closing comment lexeme. */
665 if (rtScriptLexEnsureTmpBufSpace(pThis, &pszTmp, &cchTmp, cchComment,
666 strlen(pszClosing), pTok))
667 {
668 while (*pszClosing != '\0')
669 pszTmp[cchComment++] = *pszClosing++;
670 pszTmp[cchComment++] = '\0';
671 }
672 break;
673 }
674
675 if (chTmp == '\0')
676 break; /* End of stream before closing lexeme. */
677
678 if (!rtScriptLexEnsureTmpBufSpace(pThis, &pszTmp, &cchTmp, cchComment,
679 strlen(pszClosing), pTok))
680 break;
681
682 pszTmp[cchComment++] = chTmp;
683 RTScriptLexConsumeCh(pThis);
684 }
685
686 if (pszTmp)
687 {
688 pTok->enmType = RTSCRIPTLEXTOKTYPE_COMMENT_MULTI_LINE;
689 pTok->PosEnd = pThis->Pos;
690 pTok->Type.Comment.pszComment = RTStrCacheEnterN(pThis->hStrCacheId, pszTmp, cchComment);
691 pTok->Type.Comment.cchComment = cchComment;
692 if (RT_UNLIKELY(!pTok->Type.Comment.pszComment))
693 RTScriptLexProduceTokError(pThis, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting comment into comment cache");
694
695 RTMemFree(pszTmp);
696 }
697 }
698 else
699 RTScriptLexProduceTokError(pThis, pTok, VERR_NO_MEMORY, "Lexer: Out of memory allocating temporary memory for a multi line comment");
700}
701
702
703/**
704 * Create the token from the exact match.
705 *
706 * @returns nothing.
707 * @param pThis The lexer state.
708 * @param pTok The token to fill.
709 * @param pMatch The matched string.
710 */
711static void rtScriptLexProduceTokFromExactMatch(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
712 PCRTSCRIPTLEXTOKMATCH pMatch)
713{
714 pTok->enmType = pMatch->enmTokType;
715 pTok->PosEnd = pThis->Pos;
716
717 switch (pTok->enmType)
718 {
719 case RTSCRIPTLEXTOKTYPE_OPERATOR:
720 pTok->Type.Operator.pOp = pMatch;
721 break;
722 case RTSCRIPTLEXTOKTYPE_KEYWORD:
723 pTok->Type.Keyword.pKeyword = pMatch;
724 break;
725 case RTSCRIPTLEXTOKTYPE_PUNCTUATOR:
726 pTok->Type.Punctuator.pPunctuator = pMatch;
727 break;
728 default:
729 RTScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
730 "Lexer: The match contains an invalid token type: %d\n",
731 pTok->enmType);
732 }
733}
734
735
736/**
737 * Goes through the rules trying to find a matching one.
738 *
739 * @returns Flag whether a matching rule was found.
740 * @param pThis The lexer state.
741 * @param ch The character to check.
742 * @param pTok The token to fill.
743 */
744static bool rtScriptLexProduceTokFromRules(PRTSCRIPTLEXINT pThis, char ch, PRTSCRIPTLEXTOKEN pTok)
745{
746 PCRTSCRIPTLEXRULE pRule = pThis->pCfg->paRules;
747
748 if (pRule)
749 {
750 while (pRule->pfnProd != NULL)
751 {
752 if ( ch >= pRule->chStart
753 && ch <= pRule->chEnd)
754 {
755 if (pRule->fFlags & RTSCRIPT_LEX_RULE_CONSUME)
756 RTScriptLexConsumeCh(pThis);
757 int rc = pRule->pfnProd(pThis, ch, pTok, pRule->pvUser);
758 AssertRC(rc);
759 return true;
760 }
761
762 pRule++;
763 }
764 }
765
766 return false;
767}
768
769
770/**
771 * Fills in the given token from the scanned input at the current location.
772 *
773 * @returns IPRT status code.
774 * @param pThis The lexer state.
775 * @param pTok The token to fill.
776 */
777static int rtScriptLexProduceToken(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
778{
779 RTScriptLexSkipWhitespace(pThis);
780
781 pTok->PosStart = pThis->Pos;
782
783 char ch = RTScriptLexGetCh(pThis);
784 PCRTSCRIPTLEXTOKMATCH pMatch = NULL;
785 unsigned idxComment = 0;
786 if (ch == '\0')
787 rtScriptLexProduceTokEos(pThis, pTok);
788 else if ( (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_COMMENTS_AS_TOKENS)
789 && rtScriptLexIsSingleLineComment(pThis, ch, &idxComment))
790 rtScriptLexProduceTokFromSingleLineComment(pThis, idxComment, pTok);
791 else if ( (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_COMMENTS_AS_TOKENS)
792 && rtScriptLexIsMultiLineComment(pThis, ch, &idxComment))
793 rtScriptLexProduceTokFromMultiLineComment(pThis, idxComment, pTok);
794 else if (rtScriptLexLocateExactMatchConsume(pThis, ch, &pMatch))
795 rtScriptLexProduceTokFromExactMatch(pThis, pTok, pMatch);
796 else if (!rtScriptLexProduceTokFromRules(pThis, ch, pTok))
797 {
798 if (pThis->pCfg->pfnProdDef)
799 pThis->rcRdr = pThis->pCfg->pfnProdDef(pThis, ch, pTok, pThis->pCfg->pvProdDefUser);
800 else
801 RTScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
802 "Lexer: Invalid character found in input: %c\n",
803 ch);
804 }
805
806 return pThis->rcRdr;
807}
808
809
810/**
811 * Populates the lexer for the initial use.
812 *
813 * @returns IPRT status code.
814 * @param pThis The lexer state.
815 */
816static int rtScriptLexPopulate(PRTSCRIPTLEXINT pThis)
817{
818 int rc = rtScriptLexFillBuffer(pThis);
819 if (RT_SUCCESS(rc))
820 {
821 rc = rtScriptLexProduceToken(pThis, pThis->pTokCur);
822 if (RT_SUCCESS(rc))
823 rc = rtScriptLexProduceToken(pThis, pThis->pTokNext);
824 }
825
826 return rc;
827}
828
829
830
831RTDECL(int) RTScriptLexCreateFromReader(PRTSCRIPTLEX phScriptLex, PFNRTSCRIPTLEXRDR pfnReader,
832 PFNRTSCRIPTLEXDTOR pfnDtor, void *pvUser,
833 size_t cchBuf, PRTSTRCACHE phStrCacheId, PRTSTRCACHE phStrCacheStringLit,
834 PRTSTRCACHE phStrCacheComments, PCRTSCRIPTLEXCFG pCfg)
835{
836 AssertPtrReturn(phScriptLex, VERR_INVALID_POINTER);
837 AssertPtrReturn(pfnReader, VERR_INVALID_POINTER);
838 AssertPtrReturn(pCfg, VERR_INVALID_POINTER);
839
840 /* Case insensitivity with internal lower or upper case conversion is mutually exclusive. */
841 AssertReturn( (pCfg->fFlags & (RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_LOWER | RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_UPPER))
842 != (RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_LOWER | RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_UPPER), VERR_INVALID_PARAMETER);
843
844 if (!cchBuf)
845 cchBuf = _16K;
846 int rc = VINF_SUCCESS;
847 PRTSCRIPTLEXINT pThis = (PRTSCRIPTLEXINT)RTMemAllocZ(RT_UOFFSETOF_DYN(RTSCRIPTLEXINT, achBuf[cchBuf]));
848 if (RT_LIKELY(pThis))
849 {
850 pThis->u32Magic = 0xfefecafe; /** @todo */
851 pThis->Pos.iLine = 1;
852 pThis->Pos.iCh = 1;
853 pThis->pTokCur = &pThis->aToks[0];
854 pThis->pTokNext = &pThis->aToks[1];
855 pThis->pCfg = pCfg;
856 pThis->pfnReader = pfnReader;
857 pThis->pfnDtor = pfnDtor;
858 pThis->pvUser = pvUser;
859 pThis->fFlags = 0;
860 pThis->cchStrLitMax = 0;
861 pThis->pszStrLit = NULL;
862 pThis->cchBuf = cchBuf;
863 pThis->offBufRead = 0;
864 pThis->pchCur = NULL;
865 pThis->hStrCacheId = NULL;
866 pThis->hStrCacheStringLit = NULL;
867 pThis->hStrCacheComments = NULL;
868
869 if (pCfg->fFlags & RTSCRIPT_LEX_CFG_F_COMMENTS_AS_TOKENS)
870 rc = RTStrCacheCreate(&pThis->hStrCacheComments, "LEX-Comments");
871
872 rc = RTStrCacheCreate(&pThis->hStrCacheId, "LEX-Ide");
873 if (RT_SUCCESS(rc))
874 {
875 rc = RTStrCacheCreate(&pThis->hStrCacheStringLit, "LEX-StrLit");
876 if (RT_SUCCESS(rc))
877 {
878 rc = rtScriptLexPopulate(pThis);
879 if (RT_SUCCESS(rc))
880 {
881 *phScriptLex = pThis;
882
883 if (phStrCacheId)
884 *phStrCacheId = pThis->hStrCacheId;
885 else
886 pThis->fFlags |= RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE;
887
888 if (phStrCacheStringLit)
889 *phStrCacheStringLit = pThis->hStrCacheStringLit;
890 else
891 pThis->fFlags |= RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE;
892
893 if (pCfg->fFlags & RTSCRIPT_LEX_CFG_F_COMMENTS_AS_TOKENS)
894 {
895 if (phStrCacheComments)
896 *phStrCacheComments = pThis->hStrCacheComments;
897 else
898 pThis->fFlags |= RTSCRIPT_LEX_INT_F_STR_CACHE_COMMENTS_FREE;
899 }
900
901 return VINF_SUCCESS;
902 }
903
904 RTStrCacheDestroy(pThis->hStrCacheStringLit);
905 }
906
907 RTStrCacheDestroy(pThis->hStrCacheId);
908 }
909
910 if (pThis->hStrCacheComments)
911 RTStrCacheDestroy(pThis->hStrCacheComments);
912 RTMemFree(pThis);
913 }
914 else
915 rc = VERR_NO_MEMORY;
916
917 return rc;
918}
919
920
921/**
922 * @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a string.}
923 */
924static DECLCALLBACK(int) rtScriptLexReaderStr(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
925 size_t cchBuf, size_t *pcchRead, void *pvUser)
926{
927 RT_NOREF(hScriptLex);
928
929 const char *psz = (const char *)pvUser;
930 size_t cch = strlen(psz);
931 size_t cchCopy = RT_MIN(cchBuf, cch - offBuf);
932 int rc = VINF_SUCCESS;
933
934 *pcchRead = cchCopy;
935
936 if (cchCopy)
937 memcpy(pchCur, &psz[offBuf], cchCopy * sizeof(char));
938 else
939 rc = VINF_EOF;
940
941 return rc;
942}
943
944
945RTDECL(int) RTScriptLexCreateFromString(PRTSCRIPTLEX phScriptLex, const char *pszSrc, PRTSTRCACHE phStrCacheId,
946 PRTSTRCACHE phStrCacheStringLit, PRTSTRCACHE phStrCacheComments, PCRTSCRIPTLEXCFG pCfg)
947{
948 return RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderStr, NULL, (void *)pszSrc, 0,
949 phStrCacheId, phStrCacheStringLit, phStrCacheComments, pCfg);
950}
951
952
953/**
954 * @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a file.}
955 */
956static DECLCALLBACK(int) rtScriptLexReaderFile(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
957 size_t cchBuf, size_t *pcchRead, void *pvUser)
958{
959 RT_NOREF(hScriptLex);
960
961 RTFILE hFile = (RTFILE)pvUser;
962 return RTFileReadAt(hFile, offBuf, pchCur, cchBuf, pcchRead);
963}
964
965
966/**
967 * @callback_method_impl{FNRTSCRIPTLEXDTOR, Destructor for the file variant.}
968 */
969static DECLCALLBACK(void) rtScriptLexDtorFile(RTSCRIPTLEX hScriptLex, void *pvUser)
970{
971 RT_NOREF(hScriptLex);
972
973 RTFILE hFile = (RTFILE)pvUser;
974 RTFileClose(hFile);
975}
976
977
978RTDECL(int) RTScriptLexCreateFromFile(PRTSCRIPTLEX phScriptLex, const char *pszFilename, PRTSTRCACHE phStrCacheId,
979 PRTSTRCACHE phStrCacheStringLit, PRTSTRCACHE phStrCacheComments, PCRTSCRIPTLEXCFG pCfg)
980{
981 RTFILE hFile;
982 int rc = RTFileOpen(&hFile, pszFilename, RTFILE_O_READ | RTFILE_O_DENY_WRITE | RTFILE_O_OPEN);
983 if (RT_SUCCESS(rc))
984 {
985 rc = RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderFile, rtScriptLexDtorFile, (void *)hFile, 0,
986 phStrCacheId, phStrCacheStringLit, phStrCacheComments, pCfg);
987 if (RT_FAILURE(rc))
988 RTFileClose(hFile);
989 }
990
991 return rc;
992}
993
994
995RTDECL(void) RTScriptLexDestroy(RTSCRIPTLEX hScriptLex)
996{
997 PRTSCRIPTLEXINT pThis = hScriptLex;
998 AssertPtrReturnVoid(pThis);
999
1000 if (pThis->pfnDtor)
1001 pThis->pfnDtor(pThis, pThis->pvUser);
1002
1003 if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE)
1004 RTStrCacheDestroy(pThis->hStrCacheId);
1005 if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE)
1006 RTStrCacheDestroy(pThis->hStrCacheStringLit);
1007 if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_COMMENTS_FREE)
1008 RTStrCacheDestroy(pThis->hStrCacheComments);
1009
1010 if (pThis->pszStrLit)
1011 RTStrFree(pThis->pszStrLit);
1012
1013 RTMemFree(pThis);
1014}
1015
1016
1017RTDECL(int) RTScriptLexQueryToken(RTSCRIPTLEX hScriptLex, PCRTSCRIPTLEXTOKEN *ppToken)
1018{
1019 PRTSCRIPTLEXINT pThis = hScriptLex;
1020 AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
1021 AssertPtrReturn(ppToken, VERR_INVALID_POINTER);
1022
1023 if (RT_SUCCESS(pThis->rcRdr))
1024 *ppToken = pThis->pTokCur;
1025
1026 return pThis->rcRdr;
1027}
1028
1029
1030RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexGetTokenType(RTSCRIPTLEX hScriptLex)
1031{
1032 PRTSCRIPTLEXINT pThis = hScriptLex;
1033 AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
1034
1035 if (RT_SUCCESS(pThis->rcRdr))
1036 return pThis->pTokCur->enmType;
1037
1038 return RTSCRIPTLEXTOKTYPE_INVALID;
1039}
1040
1041
1042RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexPeekNextTokenType(RTSCRIPTLEX hScriptLex)
1043{
1044 PRTSCRIPTLEXINT pThis = hScriptLex;
1045 AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
1046
1047 if (RT_SUCCESS(pThis->rcRdr))
1048 return pThis->pTokNext->enmType;
1049
1050 return RTSCRIPTLEXTOKTYPE_INVALID;
1051}
1052
1053
1054RTDECL(PCRTSCRIPTLEXTOKEN) RTScriptLexConsumeToken(RTSCRIPTLEX hScriptLex)
1055{
1056 PRTSCRIPTLEXINT pThis = hScriptLex;
1057 AssertPtrReturn(pThis, NULL);
1058
1059 /*
1060 * Stop token production as soon as the current token indicates the
1061 * end of the stream or an error
1062 */
1063 if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
1064 && pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
1065 {
1066 PRTSCRIPTLEXTOKEN pTokTmp = pThis->pTokCur;
1067
1068 /* Switch next token to current token and read in the next token. */
1069 pThis->pTokCur = pThis->pTokNext;
1070 pThis->pTokNext = pTokTmp;
1071 if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
1072 && pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
1073 rtScriptLexProduceToken(pThis, pThis->pTokNext);
1074 else
1075 pThis->pTokNext = pThis->pTokCur;
1076 }
1077
1078 return pThis->pTokCur;
1079}
1080
1081
1082RTDECL(char) RTScriptLexConsumeCh(RTSCRIPTLEX hScriptLex)
1083{
1084 return RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_DEFAULT);
1085}
1086
1087
1088RTDECL(char) RTScriptLexConsumeChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
1089{
1090 PRTSCRIPTLEXINT pThis = hScriptLex;
1091 AssertPtrReturn(pThis, '\0');
1092
1093 pThis->pchCur++;
1094 pThis->Pos.iCh++;
1095 if (pThis->pchCur == &pThis->achBuf[pThis->cchBuf])
1096 rtScriptLexFillBuffer(pThis);
1097
1098 return RTScriptLexGetChEx(pThis, fFlags);
1099}
1100
1101
1102RTDECL(char) RTScriptLexPeekCh(RTSCRIPTLEX hScriptLex, unsigned idx)
1103{
1104 return RTScriptLexPeekChEx(hScriptLex, idx, RTSCRIPT_LEX_CONV_F_DEFAULT);
1105}
1106
1107
1108RTDECL(char) RTScriptLexPeekChEx(RTSCRIPTLEX hScriptLex, unsigned idx, uint32_t fFlags)
1109{
1110 PRTSCRIPTLEXINT pThis = hScriptLex;
1111 AssertPtrReturn(pThis, '\0');
1112
1113 /* Try to fill up the input buffer if peeking would overflow it. */
1114 if (pThis->pchCur + idx >= &pThis->achBuf[pThis->cchBuf])
1115 rtScriptLexFillBuffer(pThis);
1116
1117 /* Just return the character if it is in the current buffer. */
1118 char ch = '\0';
1119 if (RT_LIKELY(pThis->pchCur + idx < &pThis->achBuf[pThis->cchBuf]))
1120 ch = pThis->pchCur[idx];
1121 else
1122 {
1123 /* Slow path, read data into temporary buffer to read character from and dismiss. */
1124 /** @todo */
1125 AssertReleaseFailed();
1126 }
1127
1128 if (!(fFlags & RTSCRIPT_LEX_CONV_F_NOTHING))
1129 {
1130 if (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_LOWER)
1131 ch = RT_C_TO_LOWER(ch);
1132 else if (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_UPPER)
1133 ch = RT_C_TO_UPPER(ch);
1134 }
1135
1136 return ch;
1137}
1138
1139
1140RTDECL(char) RTScriptLexGetCh(RTSCRIPTLEX hScriptLex)
1141{
1142 return RTScriptLexPeekCh(hScriptLex, 0);
1143}
1144
1145
1146RTDECL(char) RTScriptLexGetChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
1147{
1148 return RTScriptLexPeekChEx(hScriptLex, 0, fFlags);
1149}
1150
1151
1152RTDECL(void) RTScriptLexSkipWhitespace(RTSCRIPTLEX hScriptLex)
1153{
1154 PRTSCRIPTLEXINT pThis = hScriptLex;
1155 AssertPtrReturnVoid(pThis);
1156
1157 for (;;)
1158 {
1159 char ch = RTScriptLexGetCh(hScriptLex);
1160
1161 if (ch == '\0')
1162 break;
1163
1164 /* Check for whitespace. */
1165 const char *pszWs = pThis->pCfg->pszWhitespace ? pThis->pCfg->pszWhitespace : g_szWsDef;
1166
1167 if ( rtScriptLexLocateChInStrConsume(pThis, ch, pszWs)
1168 || rtScriptLexIsNewlineConsume(pThis, ch))
1169 continue;
1170
1171 if ( !(pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_COMMENTS_AS_TOKENS)
1172 && ( rtScriptLexIsMultiLineCommentConsume(pThis, ch)
1173 || rtScriptLexIsSingleLineCommentConsume(pThis, ch)))
1174 continue;
1175
1176 /* All white space skipped, next is some real content. */
1177 break;
1178 }
1179}
1180
1181
1182RTDECL(int) RTScriptLexScanNumber(RTSCRIPTLEX hScriptLex, uint8_t uBase, bool fAllowReal,
1183 PRTSCRIPTLEXTOKEN pTok)
1184{
1185 RT_NOREF(uBase, fAllowReal, pTok);
1186 PRTSCRIPTLEXINT pThis = hScriptLex;
1187 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1188 AssertReturn(!fAllowReal, VERR_NOT_IMPLEMENTED);
1189 AssertReturn(!uBase, VERR_NOT_IMPLEMENTED);
1190
1191 /** @todo r=aeichner Quick and dirty to have something working for the disassembler testcase.
1192 * Among others it misses overflow handling. */
1193 uBase = 10;
1194 char ch = RTScriptLexGetCh(hScriptLex);
1195 pTok->Type.Number.enmType = ch == '-'
1196 ? RTSCRIPTLEXTOKNUMTYPE_INTEGER
1197 : RTSCRIPTLEXTOKNUMTYPE_NATURAL;
1198 if (ch == '-' || ch == '+')
1199 ch = RTScriptLexConsumeCh(hScriptLex);
1200
1201 if (ch == '0')
1202 {
1203 /* Some hex prefix? */
1204 char chNext = RTScriptLexPeekCh(hScriptLex, 1);
1205 if (chNext == 'x' || chNext == 'X')
1206 {
1207 uBase = 16;
1208 RTScriptLexConsumeCh(hScriptLex);
1209 }
1210 else if (chNext >= '0' && chNext <= '9') /* Octal stuff. */
1211 AssertFailedReturn(VERR_NOT_IMPLEMENTED);
1212
1213 ch = RTScriptLexConsumeCh(hScriptLex);
1214 }
1215
1216 uint64_t u64 = 0;
1217 for (;;)
1218 {
1219 if ( (ch < '0' || ch > '9')
1220 && ( ( !(ch >= 'a' && ch <= 'f')
1221 && !(ch >= 'A' && ch <= 'F'))
1222 || uBase == 10))
1223 {
1224 if (pTok->Type.Number.enmType == RTSCRIPTLEXTOKNUMTYPE_INTEGER)
1225 pTok->Type.Number.Type.i64 = -(int64_t)u64;
1226 else
1227 pTok->Type.Number.Type.u64 = u64;
1228 pTok->enmType = RTSCRIPTLEXTOKTYPE_NUMBER;
1229 pTok->PosEnd = pThis->Pos;
1230 return VINF_SUCCESS;
1231 }
1232
1233 if (ch >= '0' && ch <= '9')
1234 u64 = (u64 * uBase) + (ch - '0');
1235 else if (ch >= 'a' && ch <= 'f')
1236 {
1237 Assert(uBase == 16);
1238 u64 = (u64 << 4) + 10 + (ch - 'a');
1239 }
1240 else if (ch >= 'A' && ch <= 'F')
1241 {
1242 Assert(uBase == 16);
1243 u64 = (u64 << 4) + 10 + (ch - 'A');
1244 }
1245
1246 ch = RTScriptLexConsumeCh(hScriptLex);
1247 }
1248}
1249
1250
1251RTDECL(int) RTScriptLexScanIdentifier(RTSCRIPTLEX hScriptLex, char ch,
1252 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1253{
1254 PRTSCRIPTLEXINT pThis = hScriptLex;
1255 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1256
1257 const char *pszCharSet = pvUser ? (const char *)pvUser : g_aszIdeCharSetDef;
1258 char aszIde[513]; RT_ZERO(aszIde);
1259 unsigned idx = 0;
1260 aszIde[idx++] = ch;
1261
1262 ch = RTScriptLexGetCh(hScriptLex);
1263 while ( idx < sizeof(aszIde) - 1
1264 && rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
1265 {
1266 aszIde[idx++] = ch;
1267 ch = RTScriptLexGetCh(hScriptLex);
1268 }
1269
1270 if ( idx == sizeof(aszIde) - 1
1271 && rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
1272 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_BUFFER_OVERFLOW, "Lexer: Identifier exceeds the allowed length");
1273
1274 /* Insert into string cache. */
1275 pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
1276 pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, &aszIde[0], idx);
1277 if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
1278 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
1279
1280 pTok->PosEnd = pThis->Pos;
1281 return VINF_SUCCESS;
1282}
1283
1284
1285/**
1286 * Adds the given character to the string literal add the given position, assuring the string
1287 * is always zero terminated.
1288 *
1289 * @returns IPRT status code.
1290 * @param pThis The lexer state.
1291 * @param ch The character to add.
1292 * @param idx At which position to add the character in the string.
1293 */
1294static int rtScriptLexScanStringLiteralChAdd(PRTSCRIPTLEXINT pThis, char ch, uint32_t idx)
1295{
1296 int rc = VINF_SUCCESS;
1297
1298 if ( !pThis->cchStrLitMax
1299 || idx >= pThis->cchStrLitMax - 1)
1300 {
1301 /* Increase memory. */
1302 size_t cchMaxNew = pThis->cchStrLitMax + 64;
1303 char *pszNew = pThis->pszStrLit;
1304 rc = RTStrRealloc(&pszNew, cchMaxNew * sizeof(char));
1305 if (RT_SUCCESS(rc))
1306 {
1307 pThis->pszStrLit = pszNew;
1308 pThis->cchStrLitMax = cchMaxNew;
1309 }
1310 }
1311
1312 if (RT_SUCCESS(rc))
1313 {
1314 pThis->pszStrLit[idx] = ch;
1315 pThis->pszStrLit[idx + 1] = '\0';
1316 }
1317
1318 return rc;
1319}
1320
1321
1322RTDECL(int) RTScriptLexScanStringLiteralC(RTSCRIPTLEX hScriptLex, char ch,
1323 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1324{
1325 RT_NOREF(ch, pvUser);
1326 PRTSCRIPTLEXINT pThis = hScriptLex;
1327 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1328
1329 uint32_t idxChCur = 0;
1330 int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1331 if (RT_FAILURE(rc))
1332 return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1333
1334 ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1335 for (;;)
1336 {
1337 if (ch == '\0')
1338 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1339 else if (ch == '\"')
1340 {
1341 RTScriptLexConsumeCh(hScriptLex);
1342
1343 /* End of string, add it to the string literal cache and build the token. */
1344 pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1345 pTok->Type.StringLit.cchString = idxChCur;
1346 pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1347 if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1348 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1349 else
1350 break;
1351 }
1352 else if (ch == '\\')
1353 {
1354 /* Start of escape sequence. */
1355 RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1356 ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1357 switch (ch)
1358 {
1359 case 'a': /* Alert (Bell) */
1360 ch = 0x07;
1361 break;
1362 case 'b': /* Backspace */
1363 ch = 0x08;
1364 break;
1365 case 'e': /* Escape character */
1366 ch = 0x1b;
1367 break;
1368 case 'f': /* Formfeed */
1369 ch = 0x0c;
1370 break;
1371 case 'n': /* Newline (line freed) */
1372 ch = 0x0a;
1373 break;
1374 case 'r': /* Carriage return */
1375 ch = 0x0d;
1376 break;
1377 case 't': /* Horizontal tab */
1378 ch = 0x09;
1379 break;
1380 case 'v': /* Vertical tab */
1381 ch = 0x0b;
1382 break;
1383 case '\\':
1384 case '\'':
1385 case '\"':
1386 case '\?':
1387 /* Can be added as is. */
1388 break;
1389 case 'x': /* Hexdecimal byte. */
1390 case '0': /* Octal */
1391 case '1':
1392 case '2':
1393 case '3':
1394 case '4':
1395 case '5':
1396 case '6':
1397 case '7':
1398 case '8':
1399 case '9':
1400 case 'u': /* Unicode point below 10000 */
1401 case 'U': /* Unicode point */
1402 default:
1403 /* Not supported for now. */
1404 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NOT_SUPPORTED, "Lexer: Invalid/unsupported escape sequence");
1405 }
1406 }
1407
1408 rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1409 if (RT_SUCCESS(rc))
1410 idxChCur++;
1411 else
1412 return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1413
1414 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1415 }
1416
1417 pTok->PosEnd = pThis->Pos;
1418 return VINF_SUCCESS;
1419}
1420
1421
1422RTDECL(int) RTScriptLexScanStringLiteralPascal(RTSCRIPTLEX hScriptLex, char ch,
1423 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1424{
1425 RT_NOREF(ch, pvUser);
1426 PRTSCRIPTLEXINT pThis = hScriptLex;
1427 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1428
1429 uint32_t idxChCur = 0;
1430 int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1431 if (RT_FAILURE(rc))
1432 return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1433
1434 ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1435 for (;;)
1436 {
1437 if (ch == '\0')
1438 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1439 else if (ch == '\'')
1440 {
1441 /*
1442 * Check whether there is a second ' coming afterwards used for
1443 * escaping ' characters.
1444 */
1445 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1446 if (ch != '\'')
1447 {
1448 /* End of string, add it to the string literal cache and build the token. */
1449 pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1450 pTok->Type.StringLit.cchString = idxChCur;
1451 pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1452 if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1453 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1454 else
1455 break;
1456 }
1457 /* else: Fall through and add the character to the string literal..*/
1458 }
1459
1460 rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1461 if (RT_SUCCESS(rc))
1462 idxChCur++;
1463 else
1464 return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1465 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1466 }
1467
1468 pTok->PosEnd = pThis->Pos;
1469 return VINF_SUCCESS;
1470}
1471
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette