sCharEncoding = get_bloginfo( 'charset' );
        $this->oEncrypt = new AmazonAutoLinks_Encrypt;
        $this->sHTMLCachePrefix = AmazonAutoLinks_Registry::TRANSIENT_PREFIX . "_HTML_";
        $this->bIsMBStringInstalled = function_exists( 'mb_language' );
        $this->bLoadHTMLFix = defined( 'LIBXML_HTML_NOIMPLIED' )
            && defined( 'LIBXML_HTML_NODEFDTD' )
            && ( version_compare( PHP_VERSION, '5.4.0' ) >= 0 );
    }

    /**
     * Creates a DOM object from a given HTML string.
     *
     * The given fragment is wrapped in a single `<div>` element before being parsed.
     *
     * @remark To output the modified HTML, perform
     * `
     * $_oDoc->saveXML( $_oDoc->documentElement, LIBXML_NOEMPTYTAG );
     * `
     * @param  string $sHTMLElements  The HTML fragment to parse.
     * @param  string $sMBLang        Passed through to `loadDOMFromHTML()`.
     * @param  string $sSourceCharSet Passed through to `loadDOMFromHTML()`.
     * @return object DOM object
     */
    public function loadDOMFromHTMLElement( $sHTMLElements, $sMBLang='uni', $sSourceCharSet='' ) {
        return $this->loadDOMFromHTML(
            // Enclosing in a div tag prevents from inserting the comment when using saveXML() later.
            '<div>' . $sHTMLElements . '</div>',
            $sMBLang,
            $sSourceCharSet
        );
    }

    /**
     * Creates a DOM object from a given url.
     *
     * The HTML is fetched with `getHTML()` and then parsed with `loadDOMFromHTML()`.
     *
     * @param  string  $sURL                The URL to fetch.
     * @param  string  $sMBLang             Passed through to `loadDOMFromHTML()`.
     * @param  boolean $bUseFileGetContents Passed through to `getHTML()`.
     * @param  string  $sSourceCharSet      Passed through to `loadDOMFromHTML()`.
     * @param  integer $iCacheDuration      Passed through to `getHTML()`.
     * @return object  DOM object
     * @since  unknown
     * @since  3.2.0   Added the cache duration parameter.
     */
    public function loadDOMFromURL( $sURL, $sMBLang='uni', $bUseFileGetContents=false, $sSourceCharSet='', $iCacheDuration=86400 ) {
        $_sHTML = $this->getHTML( $sURL, $bUseFileGetContents, $iCacheDuration );
        return $this->loadDOMFromHTML( $_sHTML, $sMBLang, $sSourceCharSet );
    }

    /**
     * Creates a DOM object from an HTML string.
     *
     * @param  string       $sHTML          The HTML contents to parse.
     * @param  string       $sMBLang        If not empty and mbstring is available, set with `mb_language()` before parsing.
     * @param  string|false $sSourceCharSet Unless `false`, the HTML string is passed to `convertCharacterEncoding()`
     * with this value as the source ("from") character set and the site character set as the destination.
     * If `false`, the HTML string is treated as it is.
     * NOTE(review): how an empty string or `true` is handled depends on `convertCharacterEncoding()`, not visible here.
     * @return DOMDocument
     */
    public function loadDOMFromHTML( $sHTML, $sMBLang='uni', $sSourceCharSet='' ) {

        // without this, the characters get broken
        if ( ! empty( $sMBLang ) && $this->bIsMBStringInstalled ) {
            mb_language( $sMBLang );
        }

        if ( false !== $sSourceCharSet ) {
            $sHTML = $this->convertCharacterEncoding(
                $sHTML,                 // subject
                $this->sCharEncoding,   // to
                $sSourceCharSet,        // from
                false                   // no html entities conversion
            );
        }

        // @todo Examine whether the below line takes effect or not.
        // mb_internal_encoding( $this->sCharEncoding );

        // Collect libxml parse errors internally instead of emitting PHP warnings for malformed markup.
        $_bInternalErrors = libxml_use_internal_errors( true );

        $oDOM = new DOMDocument( '1.0', $this->sCharEncoding );
        $oDOM->recover = true;    // @see http://stackoverflow.com/a/7386650, http://stackoverflow.com/a/9281963
        // $oDOM->strictErrorChecking = false; // @todo examine whether this is necessary or not.
        $oDOM->preserveWhiteSpace = false;
        $oDOM->formatOutput = true;
        $this->_loadHTML( $oDOM, $sHTML );

        // When internal error handling was off before this call, the buffered errors belong to this parse only.
        // Discard them so that the buffer does not keep growing. Otherwise, leave the buffer to the caller.
        if ( ! $_bInternalErrors ) {
            libxml_clear_errors();
        }
        libxml_use_internal_errors( $_bInternalErrors );

        return $oDOM;

    }

    /**
     * Performs the `loadHTML()` DOMDocument method with some additional checks and sanitization.
     *
     * Characters outside the ASCII range are converted to numeric character references beforehand
     * when the mbstring extension is available. An empty string is not parsed and the document is left empty.
     *
     * @param  DOMDocument $oDOM  The document to load the HTML into.
     * @param  string      $sHTML The HTML contents to parse.
     * @return void
     * @since  3.4.1
     */
    private function _loadHTML( $oDOM, $sHTML ) {

        // DOMDocument::loadHTML() raises a warning (PHP 7) or throws a ValueError (PHP 8) for an empty string.
        $sHTML = ( string ) $sHTML;
        if ( '' === $sHTML ) {
            return;
        }

        // The 'HTML-ENTITIES' pseudo-encoding of mb_convert_encoding() is deprecated as of PHP 8.2.
        if ( function_exists( 'mb_encode_numericentity' ) ) {
            $sHTML = mb_encode_numericentity(
                $sHTML,
                array( 0x80, 0x10FFFF, 0, 0x1FFFFF ),   // every code point outside ASCII, no offset
                $this->sCharEncoding
            );
        }

        if ( $this->bLoadHTMLFix ) {
            $oDOM->loadHTML(
                $sHTML,     // subject HTML contents to parse
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD    // removes <html><body> tags and the doctype
            );
            return;
        }
        $oDOM->loadHTML(
            $sHTML     // subject HTML contents to parse
        );

    }

    /**
     * Returns the inner HTML of the given node.
     *
     * Each child node is imported into a temporary document and serialized with `saveHTML()`.
     *
     * @param  DOMNode|null $oNode
     * @return string       An empty string if no node is given.
     */
    public function getInnerHTML( $oNode ) {

        $sInnerHTML = "";
        if ( ! $oNode ) {
            return $sInnerHTML;
        }

        foreach ( $oNode->childNodes as $_oChildNode ) {

            $_oTempDom      = new DOMDocument( '1.0', $this->sCharEncoding );
            $_oImportedNode = $_oTempDom->importNode( $_oChildNode, true );
            if ( $_oImportedNode ) {
                $_oTempDom->appendChild( $_oImportedNode );
            }

            // 3.4.1+ Sometimes <html><body> tags get inserted.
            $sInnerHTML .= $this->_getAutoInjectedWrapperTagsRemoved( @$_oTempDom->saveHTML() );

        }
        return $sInnerHTML;

    }

    /**
     * Removes wrapped `<html>` and `<body>` tags from a given string.
     *
     * Sometimes $oDOM->saveHTML() returns a string with those tags wrapped. Use this method to remove those.
     * When the `bLoadHTMLFix` flag is on, the string is only trimmed.
     *
     * @since  2.4.1
     * @param  string $sHTML
     * @return string
     */
    private function _getAutoInjectedWrapperTagsRemoved( $sHTML ) {

        $sHTML = trim( $sHTML );
        if ( $this->bLoadHTMLFix ) {
            return $sHTML;
        }
        // Strips the doctype declaration and opening/closing html, head, and body tags.
        return preg_replace( '~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\s*~i', '', $sHTML );

    }

    /**
     * Fetches HTML body with the specified URL with caching functionality.
     *
     * @param  string  $sURL
     * @param  boolean $bUseFileGetContents Whether to use `AmazonAutoLinks_HTTPClient_FileGetContents` instead of `AmazonAutoLinks_HTTPClient`.
     * @param  integer $iCacheDuration      Passed to the HTTP client.
     * @return string
     */
    public function getHTML( $sURL, $bUseFileGetContents=false, $iCacheDuration=86400 ) {

        $_oHTML = $bUseFileGetContents
            ? new AmazonAutoLinks_HTTPClient_FileGetContents( $sURL, $iCacheDuration )
            : new AmazonAutoLinks_HTTPClient( $sURL, $iCacheDuration );
        return $_oHTML->get();

    }

    /**
     * Deletes the cache of the provided URL.
     *
     * The transient name is the HTML cache prefix followed by the MD5 hash of the URL.
     *
     * @param string $sURL
     */
    public function deleteCache( $sURL ) {
        // @todo delete the item of the custom database table.
        // or deprecate this method.
        $this->deleteTransient( $this->sHTMLCachePrefix . md5( $sURL ) );
    }

    /**
     * Modifies the attributes of the given node elements by specifying a tag name.
*
     * Example:
     * `
     * $oDom->setAttributesByTagName( $oNode, 'a', array( 'target' => '_blank', 'rel' => 'nofollow' ) );
     * `
     *
     * The `src` and `href` values are passed through `esc_url()`; every value is passed through `esc_attr()`.
     *
     * @param DOMDocument|DOMElement $oNode       The node whose descendant elements are modified.
     * @param string                 $sTagName    The tag name of the elements to modify.
     * @param array                  $aAttributes Attribute names (keys) and their values.
     */
    public function setAttributesByTagName( $oNode, $sTagName, $aAttributes=array() ) {

        $_aAttributes = $this->getAsArray( $aAttributes );
        foreach( $oNode->getElementsByTagName( $sTagName ) as $_oSelectedNode ) {
            foreach( $_aAttributes as $_sAttribute => $_sProperty ) {
                // Strict comparison: with a loose one, an integer key such as 0 matches 'src' in PHP below 8.
                if ( in_array( $_sAttribute, array( 'src', 'href' ), true ) ) {
                    $_sProperty = esc_url( $_sProperty );
                }
                @$_oSelectedNode->setAttribute(
                    $_sAttribute,
                    esc_attr( $_sProperty )
                );
            }
        }

    }

    /**
     * Removes nodes by tag and class selector.
     *
     * Example:
     * `
     * $this->oDOM->removeNodeByTagAndClass( $nodeDiv, 'span', 'riRssTitle' );
     * `
     *
     * The class name is matched as a case-insensitive substring of the `class` attribute.
     *
     * @param DOMDocument|DOMElement $oNode      The node whose descendant elements are examined.
     * @param string                 $sTagName   The tag name of the elements to examine.
     * @param string                 $sClassName The class name to look for.
     * @param integer|string         $iIndex     If an integer is given, only the element at that position among
     * the elements of the tag is examined. Otherwise, all the matching elements are removed.
     */
    public function removeNodeByTagAndClass( $oNode, $sTagName, $sClassName, $iIndex='' ) {

        $_oNodes = $oNode->getElementsByTagName( $sTagName );

        // If the index is specified, deal with that single element only.
        if ( is_int( $iIndex ) ) {
            $_oTagNode = $_oNodes->item( $iIndex );
            if ( $_oTagNode && false !== stripos( $_oTagNode->getAttribute( 'class' ), $sClassName ) ) {
                $_oTagNode->parentNode->removeChild( $_oTagNode );
            }
            return;
        }

        // Otherwise, remove all - Dom is a live object so iterate backwards
        for ( $_i = $_oNodes->length - 1; $_i >= 0; $_i-- ) {
            $_oTagNode = $_oNodes->item( $_i );
            if ( false !== stripos( $_oTagNode->getAttribute( 'class' ), $sClassName ) ) {
                $_oTagNode->parentNode->removeChild( $_oTagNode );
            }
        }

    }

    /**
     * Removes specified tags from the given dom node.
     *
     * Elements are looked up with the XPath expression `//*&#47;{tag}`, so only elements that have a parent element are removed.
     *
     * @param DOMDocument $oDom  The document to remove the elements from.
     * @param array       $aTags Tag names. They are embedded in the XPath expression as they are.
     */
    public function removeTags( $oDom, array $aTags ) {

        $_oXpath = new DOMXPath( $oDom );
        foreach( $aTags as $_sTag ) {
            $_oFoundNodes = $_oXpath->query( "//*/{$_sTag}" );
            if ( ! $_oFoundNodes ) {
                continue;   // the expression could not be evaluated
            }
            foreach( $_oFoundNodes as $_oFoundNode ) {
                $_oFoundNode->parentNode->removeChild( $_oFoundNode );
            }
        }

    }

    /**
     * Returns the outer HTML of an element found with the XPath expression `/html/{tag}`.
     *
     * @param  DOMDocument $oDoc
     * @param  string      $sTag   The tag name of the element, a direct child of the `html` element.
     * @param  integer     $iIndex The zero-based position among the found elements.
     * @return string      Returns an outer HTML output of a specified tag. An empty string if the tag is not found.
     * @since  3.2.0
     */
    public function getTagOuterHTML( $oDoc, $sTag, $iIndex=0 ) {

        $_oXpath = new DOMXPath( $oDoc );
        $_oTags  = $_oXpath->query( "/html/{$sTag}" );
        $_oTag   = $_oTags ? $_oTags->item( $iIndex ) : null;

        // Passing null to saveXML() would dump the entire document instead of the requested tag.
        if ( ! $_oTag ) {
            return '';
        }
        return $oDoc->saveXML( $_oTag, LIBXML_NOEMPTYTAG );

    }

}