stats = $statsDataFactory;
		$this->htmlTransformFactory = $htmlTransformFactory;
		$this->parsoidOutputStash = $parsoidOutputStash;
		$this->envOptions = $envOptions + [
			'outputContentVersion' => Parsoid::defaultHTMLVersion(),
			'offsetType' => 'byte',
		];
		$this->parserOutputAccess = $parserOutputAccess;
		$this->pageLookup = $pageLookup;
		$this->revisionLookup = $revisionLookup;
		if ( $page === null ) {
			wfDeprecated( __METHOD__ . ' without $page', '1.43' );
		} else {
			$this->initInternal( $page, $body, $parameters, $originalRevision, $pageLanguage );
		}
	}

	/**
	 * Declare the path and query parameters understood by this helper.
	 *
	 * All parameters are optional and default to an empty value ('' or 0).
	 *
	 * @return array Parameter settings, keyed by parameter name.
	 */
	public function getParamSettings(): array {
		// JSON body schema:
		/*
		doc:
			properties:
				headers:
					type: array
					items:
						type: string
				body:
					type: [ string, object ]
			required: [ body ]

		body:
			properties:
				offsetType:
					type: string
				revid:
					type: integer
				renderid:
					type: string
				etag:
					type: string
				html:
					type: [ doc, string ]
				data-mw:
					type: doc
				original:
					properties:
						html:
							type: doc
						source:
							type: doc
						data-mw:
							type: doc
						data-parsoid:
							type: doc
			required: [ html ]
		 */

		// FUTURE: more params
		// - slot (for loading the base content)

		return [
			// XXX: should we really declare this here? Or should the endpoint do this?
			//      We are not reading this property...
			'title' => [
				Handler::PARAM_SOURCE => 'path',
				ParamValidator::PARAM_TYPE => 'string',
				ParamValidator::PARAM_DEFAULT => '',
				ParamValidator::PARAM_REQUIRED => false,
			],
			// XXX: Needed for compatibility with the parsoid transform endpoint.
			//      But revid should just be part of the info about the original data
			//      in the body.
			'oldid' => [
				Handler::PARAM_SOURCE => 'path',
				ParamValidator::PARAM_TYPE => 'int',
				ParamValidator::PARAM_DEFAULT => 0,
				ParamValidator::PARAM_REQUIRED => false,
			],
			// XXX: Supported for compatibility with the parsoid transform endpoint.
			//      If given, it should be 'html' or 'pagebundle'.
			'from' => [
				Handler::PARAM_SOURCE => 'path',
				ParamValidator::PARAM_TYPE => 'string',
				ParamValidator::PARAM_DEFAULT => '',
				ParamValidator::PARAM_REQUIRED => false,
			],
			// XXX: Supported for compatibility with the parsoid transform endpoint.
			//      Ignored.
			'format' => [
				Handler::PARAM_SOURCE => 'path',
				ParamValidator::PARAM_TYPE => 'string',
				ParamValidator::PARAM_DEFAULT => '',
				ParamValidator::PARAM_REQUIRED => false,
			],
			'contentmodel' => [ // XXX: get this from the Accept header?
				Handler::PARAM_SOURCE => 'query',
				ParamValidator::PARAM_TYPE => 'string',
				ParamValidator::PARAM_DEFAULT => '',
				ParamValidator::PARAM_REQUIRED => false,
			],
			'language' => [ // TODO: get this from Accept-Language header?!
				Handler::PARAM_SOURCE => 'query',
				ParamValidator::PARAM_TYPE => 'string',
				ParamValidator::PARAM_DEFAULT => '',
				ParamValidator::PARAM_REQUIRED => false,
			]
		];
	}

	/**
	 * Modify body and parameters to provide compatibility with legacy endpoints.
	 *
	 * The following rewrites are applied in place:
	 *  - a positive 'oldid' parameter is copied to $body['original']['revid'];
	 *  - $body['original']['etag'] is copied to $body['original']['renderid'];
	 *  - $body['original']['wikitext'] is moved to $body['original']['source'];
	 *  - if 'from' is set to something other than the page bundle format, the
	 *    page bundle parts (data-parsoid, data-mw) are dropped from the body;
	 *  - the content model is taken from $body['contentmodel'], or else from
	 *    the 'format' parameter.
	 *
	 * @see ParsoidHandler::getRequestAttributes
	 *
	 * @param array &$body
	 * @param array &$parameters
	 *
	 * @throws HttpException if 'from' is given but is neither html nor pagebundle.
	 *
	 * @return void
	 */
	private static function normalizeParameters( array &$body, array &$parameters ) {
		// If the revision ID is given in the path, pretend it was given in the body.
		if ( isset( $parameters['oldid'] ) && (int)$parameters['oldid'] > 0 ) {
			$body['original']['revid'] = (int)$parameters['oldid'];
		}

		// If an etag is given in the body, use it as the render ID.
		// Note that we support ETag format in the renderid field.
		if ( !empty( $body['original']['etag'] ) ) {
			// @phan-suppress-next-line PhanTypeInvalidDimOffset false positive
			$body['original']['renderid'] = $body['original']['etag'];
		}

		// Accept 'wikitext' as an alias for 'source'.
		if ( isset( $body['original']['wikitext'] ) ) {
			// @phan-suppress-next-line PhanTypeInvalidDimOffset false positive
			$body['original']['source'] = $body['original']['wikitext'];
			unset( $body['original']['wikitext'] );
		}

		// If 'from' is not set, we accept page bundle style input as well as full HTML.
		// If 'from' is set, we only accept page bundle style input if it is set to FORMAT_PAGEBUNDLE.
		if (
			isset( $parameters['from'] ) && $parameters['from'] !== '' &&
			$parameters['from'] !== ParsoidFormatHelper::FORMAT_PAGEBUNDLE
		) {
			unset( $body['original']['data-parsoid']['body'] );
			unset( $body['original']['data-mw']['body'] );
			unset( $body['data-mw']['body'] );
		}

		// If 'from' is given, it must be html or pagebundle.
		if (
			isset( $parameters['from'] ) && $parameters['from'] !== '' &&
			$parameters['from'] !== ParsoidFormatHelper::FORMAT_HTML &&
			$parameters['from'] !== ParsoidFormatHelper::FORMAT_PAGEBUNDLE
		) {
			throw new LocalizedHttpException(
				new MessageValue( "rest-unsupported-transform-input", [ $parameters['from'] ] ), 400
			);
		}

		// The content model given in the body takes precedence over the legacy 'format' parameter.
		if ( isset( $body['contentmodel'] ) && $body['contentmodel'] !== '' ) {
			$parameters['contentmodel'] = $body['contentmodel'];
		} elseif ( isset( $parameters['format'] ) && $parameters['format'] !== '' ) {
			$parameters['contentmodel'] = $parameters['format'];
		}
	}

	/**
	 * Initialize the helper for the given page and request data.
	 *
	 * @param PageIdentity $page
	 * @param array|string $body Body structure, or an HTML string
	 * @param array $parameters
	 * @param RevisionRecord|null $originalRevision
	 * @param Bcp47Code|null $pageLanguage
	 *
	 * @throws HttpException
	 * @deprecated since 1.43; pass arguments to constructor instead
	 */
	public function init(
		PageIdentity $page,
		$body,
		array $parameters,
		?RevisionRecord $originalRevision = null,
		?Bcp47Code $pageLanguage = null
	) {
		wfDeprecated( __METHOD__, '1.43' );
		$this->initInternal( $page, $body, $parameters, $originalRevision, $pageLanguage );
	}

	/**
	 * Set up the HTML-to-content transform from the request body and parameters.
	 *
	 * This normalizes legacy parameters, creates the transform for the modified
	 * HTML, and supplies it with whatever is known about the original revision
	 * and rendering, the modified data-mw, the content language, and the original
	 * source text.
	 *
	 * @param PageIdentity $page
	 * @param array|string $body Body structure, or an HTML string
	 * @param array $parameters
	 * @param RevisionRecord|null $originalRevision
	 * @param Bcp47Code|null $pageLanguage
	 *
	 * @throws HttpException if the body has no usable 'html' field, or if the
	 *   information about the original rendering is invalid.
	 */
	private function initInternal(
		PageIdentity $page,
		$body,
		array $parameters,
		?RevisionRecord $originalRevision = null,
		?Bcp47Code $pageLanguage = null
	) {
		if ( is_string( $body ) ) {
			$body = [ 'html' => $body ];
		}

		self::normalizeParameters( $body, $parameters );

		$this->page = $page;

		// The modified HTML may be given as a plain string, or as a "doc" structure
		// that carries the HTML in its 'body' field.
		$html = $body['html'] ?? null;
		if ( is_array( $html ) ) {
			$html = $html['body'] ?? null;
		}

		if ( $html === null ) {
			// Pass 400 explicitly, like the other client-error paths in this class,
			// so a malformed request is not reported with the exception's default status.
			throw new LocalizedHttpException(
				new MessageValue( "rest-missing-body-field", [ 'html' ] ), 400
			);
		}

		// TODO: validate $body against a proper schema.
		$this->transform = $this->htmlTransformFactory->getHtmlToContentTransform(
			$html,
			$this->page
		);

		$this->transform->setMetrics( $this->stats );

		// NOTE: Env::getContentModel will fall back to the page's recorded content model
		//       if none is set here.
		$this->transform->setOptions( [
			'contentmodel' => $parameters['contentmodel'] ?? null,
			'offsetType' => $body['offsetType'] ?? $this->envOptions['offsetType'],
		] );

		$original = $body['original'] ?? [];
		$originalRendering = null;

		if ( !isset( $original['html'] ) && !empty( $original['renderid'] ) ) {
			$key = $original['renderid'];
			// A quoted value (optionally prefixed with W/) is treated as an ETag,
			// anything else as a plain render key.
			if ( preg_match( '!^(W/)?".*"$!', $key ) ) {
				$originalRendering = ParsoidRenderID::newFromETag( $key );

				if ( !$originalRendering ) {
					throw new LocalizedHttpException(
						new MessageValue( "rest-bad-etag", [ $key ] ), 400
					);
				}
			} else {
				$originalRendering = ParsoidRenderID::newFromKey( $key );
			}
		} elseif ( !empty( $original['html'] ) || !empty( $original['data-parsoid'] ) ) {
			// NOTE: We might have an incomplete PageBundle here, with no HTML but with data-parsoid!
			// XXX: Do we need to support that, or can that just be a 400?
			$originalRendering = new PageBundle(
				$original['html']['body'] ?? '',
				$original['data-parsoid']['body'] ?? null,
				$original['data-mw']['body'] ?? null,
				null, // will be derived from $original['html']['headers']['content-type']
				$original['html']['headers'] ?? []
			);
		}

		if ( !$originalRevision && !empty( $original['revid'] ) ) {
			$originalRevision = (int)$original['revid'];
		}

		if ( $originalRevision || $originalRendering ) {
			$this->setOriginal( $originalRevision, $originalRendering );
		} else {
			if ( $this->page->exists() ) {
				$this->stats->increment( 'html_input_transform.original_html.not_given.page_exists' );
			} else {
				$this->stats->increment( 'html_input_transform.original_html.not_given.page_not_exist' );
			}
		}

		if ( isset( $body['data-mw']['body'] ) ) {
			$this->transform->setModifiedDataMW( $body['data-mw']['body'] );
		}

		// An explicitly supplied page language wins over the 'language' parameter.
		if ( $pageLanguage ) {
			$this->transform->setContentLanguage( $pageLanguage );
		} elseif ( isset( $parameters['language'] ) && $parameters['language'] !== '' ) {
			$pageLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
				$parameters['language']
			);
			$this->transform->setContentLanguage( $pageLanguage );
		}

		if ( isset( $original['source']['body'] ) ) {
			// XXX: do we really have to support wikitext overrides?
			$this->transform->setOriginalText( $original['source']['body'] );
		}
	}

	/**
	 * Return HTMLTransform object, so additional context can be provided by calling setters on it.
	 * @return HtmlToContentTransform
	 */
	public function getTransform(): HtmlToContentTransform {
		return $this->transform;
	}

	/**
	 * Set metrics sink.
	 *
	 * If a transform has already been created, the sink is passed on to it as well.
	 *
	 * @param StatsdDataFactoryInterface $stats
	 */
	public function setMetrics( StatsdDataFactoryInterface $stats ) {
		$this->stats = $stats;

		if ( $this->transform ) {
			$this->transform->setMetrics( $stats );
		}
	}

	/**
	 * Supply information about the revision and rendering that was the original basis of
	 * the input HTML. This is used to apply selective serialization (selser), if possible.
	 *
	 * @param RevisionRecord|int|null $rev
	 * @param ParsoidRenderID|PageBundle|ParserOutput|null $originalRendering
	 *
	 * @throws HttpException with status 400 if the render ID is rejected by the stash,
	 *   or with status 412 if the requested rendering is no longer available.
	 */
	public function setOriginal( $rev, $originalRendering ) {
		if ( $originalRendering instanceof ParsoidRenderID ) {
			$renderId = $originalRendering;

			// If the client asked for a render ID, load original data from stash
			try {
				$selserContext = $this->fetchSelserContextFromStash( $renderId );
			} catch ( InvalidArgumentException $ex ) {
				$this->stats->increment( 'html_input_transform.original_html.given.as_renderid.bad' );
				throw new LocalizedHttpException(
					new MessageValue( "rest-bad-stash-key" ),
					400,
					[
						'reason' => $ex->getMessage(),
						'key' => "$renderId"
					]
				);
			}

			if ( !$selserContext ) {
				// NOTE: When the client asked for a specific stash key (resp. etag),
				//       we should fail with a 412 if we don't have the specific rendering.
				//       On the other hand, if the client only provided a base revision ID,
				//       we can re-parse and hope for the best.

				// TODO: This class should provide getETag and getLastModified methods for use by
				//       the REST endpoint, to provide proper support for conditionals.
				//       However, that requires some refactoring of how HTTP conditional checks
				//       work in the Handler base class.

				throw new LocalizedHttpException(
					new MessageValue( "rest-no-stashed-content", [ $renderId->getKey() ] ), 412
				);
			}

			if ( !$rev ) {
				$rev = $renderId->getRevisionID();
			}

			$originalRendering = $selserContext->getPageBundle();
			$content = $selserContext->getContent();

			if ( $content ) {
				$this->transform->setOriginalContent( $content );
			}
		} elseif ( !$originalRendering && $rev ) {
			// The client provided a revision ID, but not stash key.
			// Try to get a rendering for the given revision, and use it as the basis for selser.
			// Chances are good that the resulting diff will be reasonably clean.
			// NOTE: If we don't have a revision ID, we should not attempt selser!
			$originalRendering = $this->fetchParserOutputFromParsoid( $this->page, $rev, true );

			if ( $originalRendering ) {
				$this->stats->increment( 'html_input_transform.original_html.given.as_revid.found' );
			} else {
				$this->stats->increment( 'html_input_transform.original_html.given.as_revid.not_found' );
			}
		} elseif ( $originalRendering ) {
			$this->stats->increment( 'html_input_transform.original_html.given.verbatim' );
		}

		if ( $originalRendering instanceof ParserOutput ) {
			$originalRendering = PageBundleParserOutputConverter::pageBundleFromParserOutput( $originalRendering );

			// NOTE: Use the default if we got a ParserOutput object.
			//       Don't apply the default if we got passed a PageBundle,
			//       in that case, we want to require the version to be explicit.
			if ( $originalRendering->version === null && !isset( $originalRendering->headers['content-type'] ) ) {
				$originalRendering->version = Parsoid::defaultHTMLVersion();
			}
		}

		// Without a page bundle there is nothing to base selser on.
		if ( !$originalRendering instanceof PageBundle ) {
			return;
		}

		// Prefer the explicit version; otherwise try to derive it from the content-type header.
		if ( $originalRendering->version !== null ) {
			$this->transform->setOriginalSchemaVersion( $originalRendering->version );
		} elseif ( !empty( $originalRendering->headers['content-type'] ) ) {
			$vOriginal = ParsoidFormatHelper::parseContentTypeHeader(
			// @phan-suppress-next-line PhanTypeArraySuspiciousNullable Silly Phan, we just checked.
				$originalRendering->headers['content-type']
			);

			if ( $vOriginal ) {
				$this->transform->setOriginalSchemaVersion( $vOriginal );
			}
		}

		if ( $rev instanceof RevisionRecord ) {
			$this->transform->setOriginalRevision( $rev );
		} elseif ( $rev && is_int( $rev ) ) {
			$this->transform->setOriginalRevisionId( $rev );
		}

		// NOTE: We might have an incomplete PageBundle here, with no HTML.
		//       PageBundle::$html is declared to not be nullable, so it would be set to the empty
		//       string if not given. Note however that it might also be null, since it's a public field.
		if ( $originalRendering->html !== null && $originalRendering->html !== '' ) {
			$this->transform->setOriginalHtml( $originalRendering->html );
		}

		if ( $originalRendering->parsoid !== null ) {
			$this->transform->setOriginalDataParsoid( $originalRendering->parsoid );
		}

		if ( $originalRendering->mw !== null ) {
			$this->transform->setOriginalDataMW( $originalRendering->mw );
		}
	}

	/**
	 * Convert the input HTML to content, mapping transform failures to HTTP errors.
	 *
	 * @return Content the content derived from the input HTML.
	 * @throws HttpException with status 400 for client errors and unknown content models,
	 *   or status 413 if a resource limit was exceeded.
	 */
	public function getContent(): Content {
		try {
			return $this->transform->htmlToContent();
		} catch ( ClientError $e ) {
			throw new LocalizedHttpException(
				new MessageValue( 'rest-html-backend-error', [ $e->getMessage() ] ),
				400,
				[ 'reason' => $e->getMessage() ]
			);
		} catch ( ResourceLimitExceededException $e ) {
			throw new LocalizedHttpException(
				new MessageValue( 'rest-resource-limit-exceeded' ),
				413,
				[ 'reason' => $e->getMessage() ]
			);
		} catch ( MWUnknownContentModelException $e ) {
			throw new LocalizedHttpException(
				new MessageValue( "rest-unknown-content-model", [ $e->getModelId() ] ),
				400
			);
		}
	}

	/**
	 * Creates a response containing the content derived from the input HTML.
	 * This will set the appropriate Content-Type header.
	 *
	 * @param ResponseInterface $response
	 *
	 * @throws HttpException if the input HTML cannot be converted to content.
	 */
	public function putContent( ResponseInterface $response ) {
		$content = $this->getContent();
		$data = $content->serialize();

		try {
			$contentType = ParsoidFormatHelper::getContentType(
				$content->getModel(),
				$this->envOptions['outputContentVersion']
			);
		} catch ( InvalidArgumentException $e ) {
			// If Parsoid doesn't know the content type,
			// ask the ContentHandler!
			$contentType = $content->getDefaultFormat();
		}

		$response->setHeader( 'Content-Type', $contentType );
		$response->getBody()->write( $data );
	}

	/**
	 * Get the Parsoid rendering of the given revision of the given page.
	 *
	 * @param PageIdentity $page
	 * @param RevisionRecord|int $revision
	 * @param bool $mayParse If true, the output is requested via getParserOutput() and a
	 *   failure status is turned into an HttpException; if false, only
	 *   getCachedParserOutput() is consulted.
	 *
	 * @return ParserOutput|null
	 * @throws HttpException with status 404 if the page or revision cannot be found,
	 *   or if the revision does not belong to the page.
	 */
	private function fetchParserOutputFromParsoid( PageIdentity $page, $revision, bool $mayParse ): ?ParserOutput {
		$parserOptions = ParserOptions::newFromAnon();
		$parserOptions->setUseParsoid();

		try {
			if ( !$page instanceof PageRecord ) {
				// Remember the name for the error message before $page is overwritten.
				$name = "$page";
				$page = $this->pageLookup->getPageByReference( $page );
				if ( !$page ) {
					throw new RevisionAccessException( 'Page {name} not found',
						[ 'name' => $name ] );
				}
			}

			if ( is_int( $revision ) ) {
				$revId = $revision;
				$revision = $this->revisionLookup->getRevisionById( $revId, 0, $page );

				if ( !$revision ) {
					throw new RevisionAccessException( 'Revision {revId} not found',
						[ 'revId' => $revId ] );
				}
			}

			if ( $page->getId() !== $revision->getPageId() ) {
				throw new RevisionAccessException( 'Revision {revId} does not belong to page {name}',
					[ 'name' => $page->getDBkey(),
						'revId' => $revision->getId() ] );
			}

			if ( $mayParse ) {
				try {
					$status = $this->parserOutputAccess->getParserOutput(
						$page,
						$parserOptions,
						$revision
					);
				} catch ( ClientError $e ) {
					$status = Status::newFatal( 'parsoid-client-error', $e->getMessage() );
				} catch ( ResourceLimitExceededException $e ) {
					$status = Status::newFatal( 'parsoid-resource-limit-exceeded', $e->getMessage() );
				}

				if ( !$status->isOK() ) {
					$this->throwHttpExceptionForStatus( $status );
				}

				$parserOutput = $status->getValue();
			} else {
				$parserOutput = $this->parserOutputAccess->getCachedParserOutput(
					$page,
					$parserOptions,
					$revision
				);
			}
		} catch ( RevisionAccessException $e ) {
			// The client supplied bad revision ID, or the revision was deleted or suppressed.
			throw new LocalizedHttpException(
				new MessageValue( "rest-specified-revision-unavailable" ),
				404,
				[ 'reason' => $e->getMessage() ]
			);
		}

		return $parserOutput;
	}

	/**
	 * Load the selser context for the given render ID from the stash, falling back
	 * to the parser cache if the stash no longer has it.
	 *
	 * @param ParsoidRenderID $renderID
	 *
	 * @return SelserContext|null The context, or null if no matching rendering is available.
	 */
	private function fetchSelserContextFromStash( $renderID ): ?SelserContext {
		$selserContext = $this->parsoidOutputStash->get( $renderID );

		if ( $selserContext ) {
			$this->stats->increment( 'html_input_transform.original_html.given.as_renderid.' .
				'stash_hit.found.hit' );

			return $selserContext;
		} else {
			// Looks like the rendering is gone from stash (or the client sent us a bogus key).
			// Try to load it from the parser cache instead.
			// On a wiki with low edit frequency, there is a good chance that it's still there.
			try {
				$parserOutput = $this->fetchParserOutputFromParsoid( $this->page, $renderID->getRevisionID(), false );

				if ( !$parserOutput ) {
					$this->stats->increment( 'html_input_transform.original_html.given.as_renderid.' .
						'stash_miss_pc_fallback.not_found.miss' );

					return null;
				}

				$cachedRenderID = ParsoidRenderID::newFromParserOutput( $parserOutput );
				if ( $cachedRenderID->getKey() !== $renderID->getKey() ) {
					$this->stats->increment( 'html_input_transform.original_html.given.as_renderid.' .
						'stash_miss_pc_fallback.not_found.mismatch' );

					// It's not the correct rendering.
					return null;
				}

				$this->stats->increment( 'html_input_transform.original_html.given.as_renderid.' .
					'stash_miss_pc_fallback.found.hit' );

				$pb = PageBundleParserOutputConverter::pageBundleFromParserOutput( $parserOutput );
				return new SelserContext( $pb, $renderID->getRevisionID() );
			} catch ( HttpException $e ) {
				$this->stats->increment( 'html_input_transform.original_html.given.as_renderid.' .
					'stash_miss_pc_fallback.not_found.failed' );

				// If the revision isn't found, don't trigger a 404. Return null to trigger a 412.
				return null;
			}
		}
	}

	/**
	 * Turn a failed Status into an HttpException: status 413 if it carries the
	 * 'parsoid-resource-limit-exceeded' message, status 400 otherwise.
	 *
	 * @param Status $status
	 *
	 * @return never
	 * @throws HttpException
	 */
	private function throwHttpExceptionForStatus( Status $status ) {
		// TODO: make this nicer.
		if ( $status->hasMessage( 'parsoid-resource-limit-exceeded' ) ) {
			throw new LocalizedHttpException(
				new MessageValue( "rest-parsoid-resource-exceeded" ),
				413,
				[ 'reason' => $status->getHTML() ]
			);
		}

		throw new LocalizedHttpException(
			new MessageValue( "rest-parsoid-error" ),
			400,
			[ 'reason' => $status->getHTML() ]
		);
	}
}