2025-03-29 15:51:03 -04:00

248 lines
11 KiB
TypeScript

import { Preprocessor } from './preprocessor.js';
import { type Token, type CharacterToken, type DoctypeToken, type TagToken, type EOFToken, type CommentToken, type Attribute, type Location } from '../common/token.js';
import { EntityDecoder } from 'entities/lib/decode.js';
import { ERR, type ParserErrorHandler } from '../common/error-codes.js';
declare const enum State {
DATA = 0,
RCDATA = 1,
RAWTEXT = 2,
SCRIPT_DATA = 3,
PLAINTEXT = 4,
TAG_OPEN = 5,
END_TAG_OPEN = 6,
TAG_NAME = 7,
RCDATA_LESS_THAN_SIGN = 8,
RCDATA_END_TAG_OPEN = 9,
RCDATA_END_TAG_NAME = 10,
RAWTEXT_LESS_THAN_SIGN = 11,
RAWTEXT_END_TAG_OPEN = 12,
RAWTEXT_END_TAG_NAME = 13,
SCRIPT_DATA_LESS_THAN_SIGN = 14,
SCRIPT_DATA_END_TAG_OPEN = 15,
SCRIPT_DATA_END_TAG_NAME = 16,
SCRIPT_DATA_ESCAPE_START = 17,
SCRIPT_DATA_ESCAPE_START_DASH = 18,
SCRIPT_DATA_ESCAPED = 19,
SCRIPT_DATA_ESCAPED_DASH = 20,
SCRIPT_DATA_ESCAPED_DASH_DASH = 21,
SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 22,
SCRIPT_DATA_ESCAPED_END_TAG_OPEN = 23,
SCRIPT_DATA_ESCAPED_END_TAG_NAME = 24,
SCRIPT_DATA_DOUBLE_ESCAPE_START = 25,
SCRIPT_DATA_DOUBLE_ESCAPED = 26,
SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 27,
SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 28,
SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 29,
SCRIPT_DATA_DOUBLE_ESCAPE_END = 30,
BEFORE_ATTRIBUTE_NAME = 31,
ATTRIBUTE_NAME = 32,
AFTER_ATTRIBUTE_NAME = 33,
BEFORE_ATTRIBUTE_VALUE = 34,
ATTRIBUTE_VALUE_DOUBLE_QUOTED = 35,
ATTRIBUTE_VALUE_SINGLE_QUOTED = 36,
ATTRIBUTE_VALUE_UNQUOTED = 37,
AFTER_ATTRIBUTE_VALUE_QUOTED = 38,
SELF_CLOSING_START_TAG = 39,
BOGUS_COMMENT = 40,
MARKUP_DECLARATION_OPEN = 41,
COMMENT_START = 42,
COMMENT_START_DASH = 43,
COMMENT = 44,
COMMENT_LESS_THAN_SIGN = 45,
COMMENT_LESS_THAN_SIGN_BANG = 46,
COMMENT_LESS_THAN_SIGN_BANG_DASH = 47,
COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH = 48,
COMMENT_END_DASH = 49,
COMMENT_END = 50,
COMMENT_END_BANG = 51,
DOCTYPE = 52,
BEFORE_DOCTYPE_NAME = 53,
DOCTYPE_NAME = 54,
AFTER_DOCTYPE_NAME = 55,
AFTER_DOCTYPE_PUBLIC_KEYWORD = 56,
BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 57,
DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 58,
DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 59,
AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 60,
BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 61,
AFTER_DOCTYPE_SYSTEM_KEYWORD = 62,
BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 63,
DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 64,
DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 65,
AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 66,
BOGUS_DOCTYPE = 67,
CDATA_SECTION = 68,
CDATA_SECTION_BRACKET = 69,
CDATA_SECTION_END = 70,
CHARACTER_REFERENCE = 71,
AMBIGUOUS_AMPERSAND = 72
}
export declare const TokenizerMode: {
readonly DATA: State.DATA;
readonly RCDATA: State.RCDATA;
readonly RAWTEXT: State.RAWTEXT;
readonly SCRIPT_DATA: State.SCRIPT_DATA;
readonly PLAINTEXT: State.PLAINTEXT;
readonly CDATA_SECTION: State.CDATA_SECTION;
};
export interface TokenizerOptions {
sourceCodeLocationInfo?: boolean;
}
export interface TokenHandler {
onComment(token: CommentToken): void;
onDoctype(token: DoctypeToken): void;
onStartTag(token: TagToken): void;
onEndTag(token: TagToken): void;
onEof(token: EOFToken): void;
onCharacter(token: CharacterToken): void;
onNullCharacter(token: CharacterToken): void;
onWhitespaceCharacter(token: CharacterToken): void;
onParseError?: ParserErrorHandler | null;
}
export declare class Tokenizer {
protected options: TokenizerOptions;
protected handler: TokenHandler;
preprocessor: Preprocessor;
protected paused: boolean;
/** Ensures that the parsing loop isn't run multiple times at once. */
protected inLoop: boolean;
/**
* Indicates that the current adjusted node exists, is not an element in the HTML namespace,
* and that it is not an integration point for either MathML or HTML.
*
* @see {@link https://html.spec.whatwg.org/multipage/parsing.html#tree-construction}
*/
inForeignNode: boolean;
lastStartTagName: string;
active: boolean;
state: State;
protected returnState: State;
/**
* We use `entities`' `EntityDecoder` to parse character references.
*
* All of the following states are handled by the `EntityDecoder`:
*
* - Named character reference state
* - Numeric character reference state
* - Hexademical character reference start state
* - Hexademical character reference state
* - Decimal character reference state
* - Numeric character reference end state
*/
protected entityDecoder: EntityDecoder;
protected entityStartPos: number;
protected consumedAfterSnapshot: number;
protected currentLocation: Location | null;
protected currentCharacterToken: CharacterToken | null;
protected currentToken: Token | null;
protected currentAttr: Attribute;
constructor(options: TokenizerOptions, handler: TokenHandler);
protected _err(code: ERR, cpOffset?: number): void;
protected getCurrentLocation(offset: number): Location | null;
protected _runParsingLoop(): void;
pause(): void;
resume(writeCallback?: () => void): void;
write(chunk: string, isLastChunk: boolean, writeCallback?: () => void): void;
insertHtmlAtCurrentPos(chunk: string): void;
protected _ensureHibernation(): boolean;
protected _consume(): number;
protected _advanceBy(count: number): void;
protected _consumeSequenceIfMatch(pattern: string, caseSensitive: boolean): boolean;
protected _createStartTagToken(): void;
protected _createEndTagToken(): void;
protected _createCommentToken(offset: number): void;
protected _createDoctypeToken(initialName: string | null): void;
protected _createCharacterToken(type: CharacterToken['type'], chars: string): void;
protected _createAttr(attrNameFirstCh: string): void;
protected _leaveAttrName(): void;
protected _leaveAttrValue(): void;
protected prepareToken(ct: Token): void;
protected emitCurrentTagToken(): void;
protected emitCurrentComment(ct: CommentToken): void;
protected emitCurrentDoctype(ct: DoctypeToken): void;
protected _emitCurrentCharacterToken(nextLocation: Location | null): void;
protected _emitEOFToken(): void;
protected _appendCharToCurrentCharacterToken(type: CharacterToken['type'], ch: string): void;
protected _emitCodePoint(cp: number): void;
protected _emitChars(ch: string): void;
protected _startCharacterReference(): void;
protected _isCharacterReferenceInAttribute(): boolean;
protected _flushCodePointConsumedAsCharacterReference(cp: number): void;
protected _callState(cp: number): void;
protected _stateData(cp: number): void;
protected _stateRcdata(cp: number): void;
protected _stateRawtext(cp: number): void;
protected _stateScriptData(cp: number): void;
protected _statePlaintext(cp: number): void;
protected _stateTagOpen(cp: number): void;
protected _stateEndTagOpen(cp: number): void;
protected _stateTagName(cp: number): void;
protected _stateRcdataLessThanSign(cp: number): void;
protected _stateRcdataEndTagOpen(cp: number): void;
protected handleSpecialEndTag(_cp: number): boolean;
protected _stateRcdataEndTagName(cp: number): void;
protected _stateRawtextLessThanSign(cp: number): void;
protected _stateRawtextEndTagOpen(cp: number): void;
protected _stateRawtextEndTagName(cp: number): void;
protected _stateScriptDataLessThanSign(cp: number): void;
protected _stateScriptDataEndTagOpen(cp: number): void;
protected _stateScriptDataEndTagName(cp: number): void;
protected _stateScriptDataEscapeStart(cp: number): void;
protected _stateScriptDataEscapeStartDash(cp: number): void;
protected _stateScriptDataEscaped(cp: number): void;
protected _stateScriptDataEscapedDash(cp: number): void;
protected _stateScriptDataEscapedDashDash(cp: number): void;
protected _stateScriptDataEscapedLessThanSign(cp: number): void;
protected _stateScriptDataEscapedEndTagOpen(cp: number): void;
protected _stateScriptDataEscapedEndTagName(cp: number): void;
protected _stateScriptDataDoubleEscapeStart(cp: number): void;
protected _stateScriptDataDoubleEscaped(cp: number): void;
protected _stateScriptDataDoubleEscapedDash(cp: number): void;
protected _stateScriptDataDoubleEscapedDashDash(cp: number): void;
protected _stateScriptDataDoubleEscapedLessThanSign(cp: number): void;
protected _stateScriptDataDoubleEscapeEnd(cp: number): void;
protected _stateBeforeAttributeName(cp: number): void;
protected _stateAttributeName(cp: number): void;
protected _stateAfterAttributeName(cp: number): void;
protected _stateBeforeAttributeValue(cp: number): void;
protected _stateAttributeValueDoubleQuoted(cp: number): void;
protected _stateAttributeValueSingleQuoted(cp: number): void;
protected _stateAttributeValueUnquoted(cp: number): void;
protected _stateAfterAttributeValueQuoted(cp: number): void;
protected _stateSelfClosingStartTag(cp: number): void;
protected _stateBogusComment(cp: number): void;
protected _stateMarkupDeclarationOpen(cp: number): void;
protected _stateCommentStart(cp: number): void;
protected _stateCommentStartDash(cp: number): void;
protected _stateComment(cp: number): void;
protected _stateCommentLessThanSign(cp: number): void;
protected _stateCommentLessThanSignBang(cp: number): void;
protected _stateCommentLessThanSignBangDash(cp: number): void;
protected _stateCommentLessThanSignBangDashDash(cp: number): void;
protected _stateCommentEndDash(cp: number): void;
protected _stateCommentEnd(cp: number): void;
protected _stateCommentEndBang(cp: number): void;
protected _stateDoctype(cp: number): void;
protected _stateBeforeDoctypeName(cp: number): void;
protected _stateDoctypeName(cp: number): void;
protected _stateAfterDoctypeName(cp: number): void;
protected _stateAfterDoctypePublicKeyword(cp: number): void;
protected _stateBeforeDoctypePublicIdentifier(cp: number): void;
protected _stateDoctypePublicIdentifierDoubleQuoted(cp: number): void;
protected _stateDoctypePublicIdentifierSingleQuoted(cp: number): void;
protected _stateAfterDoctypePublicIdentifier(cp: number): void;
protected _stateBetweenDoctypePublicAndSystemIdentifiers(cp: number): void;
protected _stateAfterDoctypeSystemKeyword(cp: number): void;
protected _stateBeforeDoctypeSystemIdentifier(cp: number): void;
protected _stateDoctypeSystemIdentifierDoubleQuoted(cp: number): void;
protected _stateDoctypeSystemIdentifierSingleQuoted(cp: number): void;
protected _stateAfterDoctypeSystemIdentifier(cp: number): void;
protected _stateBogusDoctype(cp: number): void;
protected _stateCdataSection(cp: number): void;
protected _stateCdataSectionBracket(cp: number): void;
protected _stateCdataSectionEnd(cp: number): void;
protected _stateCharacterReference(): void;
protected _stateAmbiguousAmpersand(cp: number): void;
}
export {};