2025-03-29 15:51:03 -04:00

191 lines
6.9 KiB
JavaScript

/**
* @file Batteries-included version of Cheerio. This module includes several
* convenience methods for loading documents from various sources.
*/
export * from './load-parse.js';
export { contains, merge } from './static.js';
import { adapter as htmlparser2Adapter } from 'parse5-htmlparser2-tree-adapter';
import * as htmlparser2 from 'htmlparser2';
import { ParserStream as Parse5Stream } from 'parse5-parser-stream';
import { decodeBuffer, DecodeStream, } from 'encoding-sniffer';
import * as undici from 'undici';
import MIMEType from 'whatwg-mimetype';
import { Writable, finished } from 'node:stream';
import { flattenOptions, } from './options.js';
import { load } from './load-parse.js';
/**
* Sniffs the encoding of a buffer, then creates a querying function bound to a
* document created from the buffer.
*
* @category Loading
* @example
*
* ```js
* import * as cheerio from 'cheerio';
*
* const buffer = fs.readFileSync('index.html');
* const $ = cheerio.fromBuffer(buffer);
* ```
*
* @param buffer - The buffer to sniff the encoding of.
* @param options - The options to pass to Cheerio.
* @returns The loaded document.
*/
export function loadBuffer(buffer, options = {}) {
const opts = flattenOptions(options);
const str = decodeBuffer(buffer, {
defaultEncoding: (opts === null || opts === void 0 ? void 0 : opts.xmlMode) ? 'utf8' : 'windows-1252',
...options.encoding,
});
return load(str, opts);
}
function _stringStream(options, cb) {
var _a;
if (options === null || options === void 0 ? void 0 : options._useHtmlParser2) {
const parser = htmlparser2.createDocumentStream((err, document) => cb(err, load(document)), options);
return new Writable({
decodeStrings: false,
write(chunk, _encoding, callback) {
if (typeof chunk !== 'string') {
throw new TypeError('Expected a string');
}
parser.write(chunk);
callback();
},
final(callback) {
parser.end();
callback();
},
});
}
options !== null && options !== void 0 ? options : (options = {});
(_a = options.treeAdapter) !== null && _a !== void 0 ? _a : (options.treeAdapter = htmlparser2Adapter);
if (options.scriptingEnabled !== false) {
options.scriptingEnabled = true;
}
const stream = new Parse5Stream(options);
finished(stream, (err) => cb(err, load(stream.document)));
return stream;
}
/**
* Creates a stream that parses a sequence of strings into a document.
*
* The stream is a `Writable` stream that accepts strings. When the stream is
* finished, the callback is called with the loaded document.
*
* @category Loading
* @example
*
* ```js
* import * as cheerio from 'cheerio';
* import * as fs from 'fs';
*
* const writeStream = cheerio.stringStream({}, (err, $) => {
* if (err) {
* // Handle error
* }
*
* console.log($('h1').text());
* // Output: Hello, world!
* });
*
* fs.createReadStream('my-document.html', { encoding: 'utf8' }).pipe(
* writeStream,
* );
* ```
*
* @param options - The options to pass to Cheerio.
* @param cb - The callback to call when the stream is finished.
* @returns The writable stream.
*/
export function stringStream(options, cb) {
return _stringStream(flattenOptions(options), cb);
}
/**
* Parses a stream of buffers into a document.
*
* The stream is a `Writable` stream that accepts buffers. When the stream is
* finished, the callback is called with the loaded document.
*
* @category Loading
* @param options - The options to pass to Cheerio.
* @param cb - The callback to call when the stream is finished.
* @returns The writable stream.
*/
export function decodeStream(options, cb) {
var _a;
const { encoding = {}, ...cheerioOptions } = options;
const opts = flattenOptions(cheerioOptions);
// Set the default encoding to UTF-8 for XML mode
(_a = encoding.defaultEncoding) !== null && _a !== void 0 ? _a : (encoding.defaultEncoding = (opts === null || opts === void 0 ? void 0 : opts.xmlMode) ? 'utf8' : 'windows-1252');
const decodeStream = new DecodeStream(encoding);
const loadStream = _stringStream(opts, cb);
decodeStream.pipe(loadStream);
return decodeStream;
}
const defaultRequestOptions = {
method: 'GET',
// Allow redirects by default
maxRedirections: 5,
// NOTE: `throwOnError` currently doesn't work https://github.com/nodejs/undici/issues/1753
throwOnError: true,
// Set an Accept header
headers: {
accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
},
};
/**
* `fromURL` loads a document from a URL.
*
* By default, redirects are allowed and non-2xx responses are rejected.
*
* @category Loading
* @example
*
* ```js
* import * as cheerio from 'cheerio';
*
* const $ = await cheerio.fromURL('https://example.com');
* ```
*
* @param url - The URL to load the document from.
* @param options - The options to pass to Cheerio.
* @returns The loaded document.
*/
export async function fromURL(url, options = {}) {
var _a;
const { requestOptions = defaultRequestOptions, encoding = {}, ...cheerioOptions } = options;
let undiciStream;
// Add headers if none were supplied.
(_a = requestOptions.headers) !== null && _a !== void 0 ? _a : (requestOptions.headers = defaultRequestOptions.headers);
const promise = new Promise((resolve, reject) => {
undiciStream = undici.stream(url, requestOptions, (res) => {
var _a, _b;
const contentType = (_a = res.headers['content-type']) !== null && _a !== void 0 ? _a : 'text/html';
const mimeType = new MIMEType(Array.isArray(contentType) ? contentType[0] : contentType);
if (!mimeType.isHTML() && !mimeType.isXML()) {
throw new RangeError(`The content-type "${contentType}" is neither HTML nor XML.`);
}
// Forward the charset from the header to the decodeStream.
encoding.transportLayerEncodingLabel = mimeType.parameters.get('charset');
/*
* If we allow redirects, we will have entries in the history.
* The last entry will be the final URL.
*/
const history = (_b = res.context) === null || _b === void 0 ? void 0 : _b.history;
const opts = {
encoding,
// Set XML mode based on the MIME type.
xmlMode: mimeType.isXML(),
// Set the `baseURL` to the final URL.
baseURL: history ? history[history.length - 1] : url,
...cheerioOptions,
};
return decodeStream(opts, (err, $) => (err ? reject(err) : resolve($)));
});
});
// Let's make sure the request is completed before returning the promise.
await undiciStream;
return promise;
}
//# sourceMappingURL=index.js.map