Add automagic-images plugin v1.1.1

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-19 00:18:34 +02:00
parent e51038451a
commit 790860e381
241 changed files with 30733 additions and 0 deletions
@@ -0,0 +1,11 @@
# Security Policy
## Supported Versions
Only the most recent version receives security fixes.
## Reporting a Vulnerability
If you have found any issues that might have security implications, please refer to https://tidelift.com/security
Do not disclose security issues publicly.
@@ -0,0 +1,38 @@
{
"name": "paquettg/php-html-parser",
"type": "library",
"description": "An HTML DOM parser. It allows you to manipulate HTML. Find tags on an HTML page with selectors just like jQuery.",
"keywords": ["html", "dom", "parser"],
"homepage": "https://github.com/paquettg/php-html-parser",
"license": "MIT",
"authors": [
{
"name": "Gilles Paquette",
"email": "paquettg@gmail.com",
"homepage": "http://gillespaquette.ca"
}
],
"require": {
"php": ">=7.2",
"ext-mbstring": "*",
"ext-zlib": "*",
"ext-curl": "*",
"paquettg/string-encode": "~1.0.0",
"php-http/httplug": "^2.1",
"guzzlehttp/guzzle": "^7.0",
"guzzlehttp/psr7": "^1.6",
"myclabs/php-enum": "^1.7"
},
"require-dev": {
"phpunit/phpunit": "^7.5.1",
"mockery/mockery": "^1.2",
"infection/infection": "^0.13.4",
"phan/phan": "^2.4",
"friendsofphp/php-cs-fixer": "^2.16"
},
"autoload": {
"psr-4": {
"PHPHtmlParser\\": "src/PHPHtmlParser"
}
}
}
@@ -0,0 +1,257 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser;
use PHPHtmlParser\Enum\StringToken;
use PHPHtmlParser\Exceptions\ContentLengthException;
use PHPHtmlParser\Exceptions\LogicalException;
/**
 * Class Content.
 *
 * Wraps a string together with a byte cursor and offers helpers to peek,
 * copy and skip through it. All positions and sizes are byte offsets
 * (strlen()/substr() semantics).
 */
class Content
{
    /**
     * The content string.
     *
     * @var string
     */
    protected $content;

    /**
     * The size of the content in bytes.
     *
     * @var int
     */
    protected $size;

    /**
     * The current position we are in the content.
     *
     * @var int
     */
    protected $pos;

    /**
     * The following 4 strings are tags that are important to us.
     *
     * @var string
     */
    protected $blank = " \t\r\n";
    protected $equal = ' =/>';
    protected $slash = " />\r\n\t";
    protected $attr = ' >';

    /**
     * Content constructor. Starts with the cursor at offset 0.
     */
    public function __construct(string $content = '')
    {
        $this->content = $content;
        $this->size = \strlen($content);
        $this->pos = 0;
    }

    /**
     * Returns the current position of the content.
     */
    public function getPosition(): int
    {
        return $this->pos;
    }

    /**
     * Gets the character at the given offset, or at the current position
     * when no offset is given. Returns an empty string when the offset does
     * not exist.
     *
     * @param ?int $char offset to read; null means the current position
     */
    public function char(?int $char = null): string
    {
        return $this->content[$char ?? $this->pos] ?? '';
    }

    /**
     * Gets a string from the current character position without moving
     * the cursor.
     *
     * The loop body runs at least once, so a $length below 1 still yields
     * the character at the current position.
     *
     * @param int $length
     * @return string
     */
    public function string(int $length = 1): string
    {
        $string = '';
        $position = $this->pos;
        do {
            $string .= $this->char($position++);
        } while ($position < $this->pos + $length);

        return $string;
    }

    /**
     * Moves the current position forward.
     *
     * @throws ContentLengthException when the move would pass the end of the content
     */
    public function fastForward(int $count): Content
    {
        if (!$this->canFastForward($count)) {
            // trying to go over the content length, throw exception
            throw new ContentLengthException('Attempt to fastForward pass the length of the content.');
        }
        $this->pos += $count;

        return $this;
    }

    /**
     * Checks if we can move the position forward.
     */
    public function canFastForward(int $count): bool
    {
        return \strlen($this->content) >= $this->pos + $count;
    }

    /**
     * Moves the current position backward, clamping at offset 0.
     */
    public function rewind(int $count): Content
    {
        $this->pos -= $count;
        if ($this->pos < 0) {
            $this->pos = 0;
        }

        return $this;
    }

    /**
     * Copy the content until we find the given string and advance the
     * cursor to the start of the match.
     *
     * - $escape: search for $string as a substring, ignoring matches that are
     *   directly preceded by a backslash (this branch takes precedence over $char).
     * - $char: treat $string as a set of characters and stop at the first of them.
     * - otherwise: search for $string as a substring.
     *
     * When nothing is found the remainder of the content is returned and the
     * cursor moves to the end.
     *
     * @throws LogicalException
     */
    public function copyUntil(string $string, bool $char = false, bool $escape = false): string
    {
        if ($this->pos >= $this->size) {
            // nothing left
            return '';
        }

        if ($escape) {
            $position = $this->pos;
            $found = false;
            while (!$found) {
                $position = \strpos($this->content, $string, $position);
                if ($position === false) {
                    // reached the end
                    break;
                }

                // Only inspect the preceding byte when there is one: offset -1
                // would be resolved by PHP to the LAST byte of the content and
                // could wrongly flag a match at offset 0 as escaped.
                if ($position > 0 && $this->content[$position - 1] === '\\') {
                    // this character is escaped
                    ++$position;
                    continue;
                }

                $found = true;
            }
        } elseif ($char) {
            $position = \strcspn($this->content, $string, $this->pos);
            $position += $this->pos;
        } else {
            $position = \strpos($this->content, $string, $this->pos);
        }

        if ($position === false) {
            // could not find character, just return the remaining of the content
            $return = \substr($this->content, $this->pos, $this->size - $this->pos);
            if ($return === false) {
                throw new LogicalException('Substr returned false with position ' . $this->pos . '.');
            }
            $this->pos = $this->size;

            return $return;
        }

        if ($position === $this->pos) {
            // we are at the right place
            return '';
        }

        $return = \substr($this->content, $this->pos, $position - $this->pos);
        if ($return === false) {
            throw new LogicalException('Substr returned false with position ' . $this->pos . '.');
        }
        // set the new position
        $this->pos = $position;

        return $return;
    }

    /**
     * Copies the content until the string is found and return it
     * unless the 'unless' is found in the substring.
     *
     * The cursor is first advanced by one; if any character of $unless occurs
     * in the copied text the cursor is restored and '' is returned.
     *
     * @throws ContentLengthException
     * @throws LogicalException
     */
    public function copyUntilUnless(string $string, string $unless): string
    {
        $lastPos = $this->pos;
        $this->fastForward(1);
        $foundString = $this->copyUntil($string, true, true);

        $position = \strcspn($foundString, $unless);
        if ($position == \strlen($foundString)) {
            return $string . $foundString;
        }
        // rewind changes and return nothing
        $this->pos = $lastPos;

        return '';
    }

    /**
     * Copies the content until it reaches the token string.
     *
     * @uses $this->copyUntil()
     *
     * @throws LogicalException
     */
    public function copyByToken(StringToken $stringToken, bool $char = false, bool $escape = false): string
    {
        $string = $stringToken->getValue();

        return $this->copyUntil($string, $char, $escape);
    }

    /**
     * Skip a given set of characters, optionally returning what was skipped.
     *
     * @throws LogicalException
     */
    public function skip(string $string, bool $copy = false): string
    {
        $len = \strspn($this->content, $string, $this->pos);
        if ($len === false) {
            throw new LogicalException('Strspn returned false with position ' . $this->pos . '.');
        }
        $return = '';
        if ($copy) {
            $return = \substr($this->content, $this->pos, $len);
            if ($return === false) {
                throw new LogicalException('Substr returned false with position ' . $this->pos . '.');
            }
        }

        // update the position
        $this->pos += $len;

        return $return;
    }

    /**
     * Skip a given token of pre-defined characters.
     *
     * @uses $this->skip()
     *
     * @throws LogicalException
     */
    public function skipByToken(StringToken $skipToken, bool $copy = false): string
    {
        $string = $skipToken->getValue();

        return $this->skip($string, $copy);
    }
}
@@ -0,0 +1,16 @@
<?php
namespace PHPHtmlParser\Contracts\Dom;
use PHPHtmlParser\Exceptions\LogicalException;
use PHPHtmlParser\Options;
/**
* Contract for a component that pre-processes a raw input string and returns
* the string to be parsed.
*/
interface CleanerInterface
{
/**
* Cleans the html of any non-html information.
*
* @param string  $str            the raw input string
* @param Options $options        options passed along with the input
* @param string  $defaultCharset charset name supplied by the caller
*
* @return string the cleaned string
*
* @throws LogicalException
*/
public function clean(string $str, Options $options, string $defaultCharset): string;
}
@@ -0,0 +1,33 @@
<?php
namespace PHPHtmlParser\Contracts\Dom;
use PHPHtmlParser\Content;
use PHPHtmlParser\Dom\Node\AbstractNode;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
use PHPHtmlParser\Exceptions\CircularException;
use PHPHtmlParser\Exceptions\ContentLengthException;
use PHPHtmlParser\Exceptions\LogicalException;
use PHPHtmlParser\Exceptions\StrictException;
use PHPHtmlParser\Options;
/**
* Contract for a component that turns a Content cursor into a node tree.
*/
interface ParserInterface
{
/**
* Attempts to parse the html in content.
*
* @param Options $options options for this parse run
* @param Content $content cursor over the string to parse
* @param int     $size    a length supplied by the caller; its exact meaning is
*                         defined by the implementation — TODO confirm
*
* @return AbstractNode the node returned by the implementation (presumably the tree root — confirm)
*
* @throws ChildNotFoundException
* @throws CircularException
* @throws ContentLengthException
* @throws LogicalException
* @throws StrictException
*/
public function parse(Options $options, Content $content, int $size): AbstractNode;
/**
* Attempts to detect the charset that the html was sent in.
*
* @param Options      $options        options for this run
* @param string       $defaultCharset charset name supplied by the caller
* @param AbstractNode $root           node to inspect
*
* @return bool result flag; semantics are defined by the implementation — TODO confirm
*
* @throws ChildNotFoundException
*/
public function detectCharset(Options $options, string $defaultCharset, AbstractNode $root): bool;
}
@@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Contracts;
use PHPHtmlParser\Dom;
use PHPHtmlParser\Options;
use Psr\Http\Client\ClientInterface;
use Psr\Http\Message\RequestInterface;
/**
 * Public contract of the Dom facade: load a document from a file, a URL or a
 * string, set options, and query the result with a selector.
 */
interface DomInterface
{
    /**
     * Loads a document from the given file.
     */
    public function loadFromFile(string $file, ?Options $options = null): Dom;

    /**
     * Loads a document from the given URL, optionally using the supplied
     * PSR-18 client and PSR-7 request.
     */
    public function loadFromUrl(string $url, ?Options $options, ?ClientInterface $client = null, ?RequestInterface $request = null): Dom;

    /**
     * Loads a document from the given string.
     */
    public function loadStr(string $str, ?Options $options = null): Dom;

    /**
     * Sets the options object.
     */
    public function setOptions(Options $options): Dom;

    /**
     * Finds elements by selector.
     *
     * The nullable type is spelled out explicitly: relying on "= null" to make
     * an "int" parameter nullable is deprecated as of PHP 8.4. The parameter
     * type is unchanged, so existing implementations stay compatible.
     *
     * @param ?int $nth optional index into the matches
     *
     * @return mixed
     */
    public function find(string $selector, ?int $nth = null);
}
@@ -0,0 +1,12 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Contracts\Selector;
use PHPHtmlParser\DTO\Selector\ParsedSelectorCollectionDTO;
/**
* Contract for a component that parses a selector string.
*/
interface ParserInterface
{
/**
* Parses the given selector string into a ParsedSelectorCollectionDTO.
*/
public function parseSelectorString(string $selector): ParsedSelectorCollectionDTO;
}
@@ -0,0 +1,17 @@
<?php
namespace PHPHtmlParser\Contracts\Selector;
use PHPHtmlParser\DTO\Selector\RuleDTO;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
/**
* Contract for a component that applies one selector rule to a list of nodes.
*/
interface SeekerInterface
{
/**
* Attempts to find all children that match the rule
* given.
*
* @param array   $nodes   nodes to search from
* @param RuleDTO $rule    the rule to match
* @param array   $options additional options; shape is defined by the implementation — TODO confirm
*
* @return array the matching nodes
*
* @throws ChildNotFoundException
*/
public function seek(array $nodes, RuleDTO $rule, array $options): array;
}
@@ -0,0 +1,31 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Contracts\Selector;
use PHPHtmlParser\Dom\Node\AbstractNode;
use PHPHtmlParser\Dom\Node\Collection;
use PHPHtmlParser\DTO\Selector\ParsedSelectorCollectionDTO;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
/**
* Contract for a selector: built from a selector string, it can be run
* against a node to collect matches.
*/
interface SelectorInterface
{
/**
* Constructs with the selector string.
*
* @param string           $selector the selector string
* @param ?ParserInterface $parser   optional selector parser
* @param ?SeekerInterface $seeker   optional seeker
*/
public function __construct(string $selector, ?ParserInterface $parser = null, ?SeekerInterface $seeker = null);
/**
* Returns the selectors that were found.
*/
public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO;
/**
* Attempts to find the selectors starting from the given
* node object.
*
* @throws ChildNotFoundException
*/
public function find(AbstractNode $node): Collection;
}
@@ -0,0 +1,41 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\DTO\Selector;
/**
 * Immutable list of ParsedSelectorDTO objects.
 */
final class ParsedSelectorCollectionDTO
{
    /**
     * @var ParsedSelectorDTO[]
     */
    private $parsedSelectorDTO = [];

    /**
     * Keeps only the entries that are ParsedSelectorDTO instances, in their
     * original order and re-indexed from zero.
     *
     * @param ParsedSelectorDTO[] $parsedSelectorDTOs
     */
    private function __construct(array $parsedSelectorDTOs)
    {
        $this->parsedSelectorDTO = \array_values(\array_filter(
            $parsedSelectorDTOs,
            static function ($candidate): bool {
                return $candidate instanceof ParsedSelectorDTO;
            }
        ));
    }

    /**
     * Named constructor.
     *
     * @param ParsedSelectorDTO[] $parsedSelectorDTOs
     */
    public static function makeCollection(array $parsedSelectorDTOs): ParsedSelectorCollectionDTO
    {
        return new self($parsedSelectorDTOs);
    }

    /**
     * @return ParsedSelectorDTO[]
     */
    public function getParsedSelectorDTO(): array
    {
        return $this->parsedSelectorDTO;
    }
}
@@ -0,0 +1,41 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\DTO\Selector;
/**
 * Immutable list of RuleDTO objects.
 */
final class ParsedSelectorDTO
{
    /**
     * @var RuleDTO[]
     */
    private $rules = [];

    /**
     * Keeps only the entries that are RuleDTO instances, in their original
     * order and re-indexed from zero.
     *
     * @param RuleDTO[] $ruleDTOs
     */
    private function __construct(array $ruleDTOs)
    {
        $this->rules = \array_values(\array_filter(
            $ruleDTOs,
            static function ($candidate): bool {
                return $candidate instanceof RuleDTO;
            }
        ));
    }

    /**
     * Named constructor.
     *
     * @param RuleDTO[] $ruleDTOs
     */
    public static function makeFromRules(array $ruleDTOs): ParsedSelectorDTO
    {
        return new self($ruleDTOs);
    }

    /**
     * @return RuleDTO[]
     */
    public function getRules(): array
    {
        return $this->rules;
    }
}
@@ -0,0 +1,100 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\DTO\Selector;
/**
 * Value object holding the parts of a single selector rule.
 */
final class RuleDTO
{
    /**
     * @var string
     */
    private $tag;

    /**
     * @var string
     */
    private $operator;

    /**
     * @var string|array|null
     */
    private $key;

    /**
     * @var string|array|null
     */
    private $value;

    /**
     * @var bool
     */
    private $noKey;

    /**
     * @var bool
     */
    private $alterNext;

    /**
     * @param array $values map with the keys tag, operator, key, value, noKey and alterNext
     */
    private function __construct(array $values)
    {
        [
            'tag'       => $this->tag,
            'operator'  => $this->operator,
            'key'       => $this->key,
            'value'     => $this->value,
            'noKey'     => $this->noKey,
            'alterNext' => $this->alterNext,
        ] = $values;
    }

    /**
     * Named constructor.
     *
     * @param string|array|null $key
     * @param string|array|null $value
     */
    public static function makeFromPrimitives(string $tag, string $operator, $key, $value, bool $noKey, bool $alterNext): RuleDTO
    {
        $parts = [
            'tag'       => $tag,
            'operator'  => $operator,
            'key'       => $key,
            'value'     => $value,
            'noKey'     => $noKey,
            'alterNext' => $alterNext,
        ];

        return new self($parts);
    }

    public function getTag(): string
    {
        return $this->tag;
    }

    public function getOperator(): string
    {
        return $this->operator;
    }

    /**
     * @return string|array|null
     */
    public function getKey()
    {
        return $this->key;
    }

    /**
     * @return string|array|null
     */
    public function getValue()
    {
        return $this->value;
    }

    public function isNoKey(): bool
    {
        return $this->noKey;
    }

    public function isAlterNext(): bool
    {
        return $this->alterNext;
    }
}
@@ -0,0 +1,60 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\DTO\Tag;
use stringEncode\Encode;
use stringEncode\Exception;
/**
 * Holds one attribute value together with the quote style flag.
 */
final class AttributeDTO
{
    /**
     * The attribute value; null when the attribute has no value.
     *
     * @var ?string
     */
    private $value;

    /**
     * @var bool
     */
    private $doubleQuote;

    /**
     * @param array $values map with the key "value" and, optionally, "doubleQuote" (defaults to true)
     */
    private function __construct(array $values)
    {
        $this->value = $values['value'];
        $this->doubleQuote = $values['doubleQuote'] ?? true;
    }

    /**
     * Named constructor.
     */
    public static function makeFromPrimitives(?string $value, bool $doubleQuote = true): AttributeDTO
    {
        return new AttributeDTO([
            'value' => $value,
            'doubleQuote' => $doubleQuote,
        ]);
    }

    public function getValue(): ?string
    {
        return $this->value;
    }

    public function isDoubleQuote(): bool
    {
        return $this->doubleQuote;
    }

    /**
     * Decodes HTML special characters in the value, if there is a value.
     */
    public function htmlspecialcharsDecode(): void
    {
        if (!\is_null($this->value)) {
            $this->value = \htmlspecialchars_decode($this->value);
        }
    }

    /**
     * Converts the value with the given encoder.
     *
     * A null value is left untouched, mirroring htmlspecialcharsDecode():
     * handing null to the encoder would raise a TypeError instead of
     * converting anything.
     *
     * @throws Exception
     */
    public function encodeValue(Encode $encode): void
    {
        if ($this->value === null) {
            return;
        }

        $this->value = $encode->convert($this->value);
    }
}
@@ -0,0 +1,74 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\DTO;
use PHPHtmlParser\Dom\Node\HtmlNode;
/**
 * Value object carrying a status flag, a closing flag, an optional node and
 * an optional tag name.
 */
final class TagDTO
{
    /**
     * @var bool
     */
    private $status;

    /**
     * @var bool
     */
    private $closing;

    /**
     * @var ?HtmlNode
     */
    private $node;

    /**
     * @var ?string
     */
    private $tag;

    /**
     * @param array $values optional keys: status, closing, node, tag
     */
    private function __construct(array $values = [])
    {
        $this->tag = $values['tag'] ?? null;
        $this->node = $values['node'] ?? null;
        $this->closing = $values['closing'] ?? false;
        $this->status = $values['status'] ?? false;
    }

    /**
     * Named constructor.
     */
    public static function makeFromPrimitives(bool $status = false, bool $closing = false, ?HtmlNode $node = null, ?string $tag = null): TagDTO
    {
        return new self(\compact('status', 'closing', 'node', 'tag'));
    }

    public function isStatus(): bool
    {
        return $this->status;
    }

    public function isClosing(): bool
    {
        return $this->closing;
    }

    /**
     * @return HtmlNode|null
     */
    public function getNode(): ?HtmlNode
    {
        return $this->node;
    }

    /**
     * @return string|null
     */
    public function getTag(): ?string
    {
        return $this->tag;
    }
}
@@ -0,0 +1,25 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Discovery;
use PHPHtmlParser\Contracts\Dom\CleanerInterface;
use PHPHtmlParser\Dom\Cleaner;
/**
 * Hands out a single, lazily created Cleaner.
 */
class CleanerDiscovery
{
    /**
     * The cached cleaner, created on first use.
     *
     * @var Cleaner|null
     */
    private static $instance = null;

    /**
     * Returns the cached cleaner, creating it on the first call.
     */
    public static function find(): CleanerInterface
    {
        if (self::$instance !== null) {
            return self::$instance;
        }

        self::$instance = new Cleaner();

        return self::$instance;
    }
}
@@ -0,0 +1,25 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Discovery;
use PHPHtmlParser\Contracts\Dom\ParserInterface;
use PHPHtmlParser\Dom\Parser;
/**
 * Hands out a single, lazily created DOM Parser.
 */
class DomParserDiscovery
{
    /**
     * The cached parser, created on first use.
     *
     * @var ParserInterface|null
     */
    private static $instance = null;

    /**
     * Returns the cached parser, creating it on the first call.
     */
    public static function find(): ParserInterface
    {
        if (self::$instance !== null) {
            return self::$instance;
        }

        self::$instance = new Parser();

        return self::$instance;
    }
}
@@ -0,0 +1,25 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Discovery;
use PHPHtmlParser\Contracts\Selector\SeekerInterface;
use PHPHtmlParser\Selector\Seeker;
/**
 * Hands out a single, lazily created Seeker.
 */
class SeekerDiscovery
{
    /**
     * The cached seeker, created on first use.
     *
     * @var SeekerInterface|null
     */
    private static $instance = null;

    /**
     * Returns the cached seeker, creating it on the first call.
     */
    public static function find(): SeekerInterface
    {
        if (self::$instance !== null) {
            return self::$instance;
        }

        self::$instance = new Seeker();

        return self::$instance;
    }
}
@@ -0,0 +1,25 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Discovery;
use PHPHtmlParser\Contracts\Selector\ParserInterface;
use PHPHtmlParser\Selector\Parser;
/**
 * Hands out a single, lazily created selector Parser.
 */
class SelectorParserDiscovery
{
    /**
     * The cached parser, created on first use.
     *
     * @var ParserInterface|null
     */
    private static $instance = null;

    /**
     * Returns the cached parser, creating it on the first call.
     */
    public static function find(): ParserInterface
    {
        if (self::$instance !== null) {
            return self::$instance;
        }

        self::$instance = new Parser();

        return self::$instance;
    }
}
@@ -0,0 +1,251 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Client;
use PHPHtmlParser\Contracts\Dom\CleanerInterface;
use PHPHtmlParser\Contracts\Dom\ParserInterface;
use PHPHtmlParser\Contracts\DomInterface;
use PHPHtmlParser\Discovery\CleanerDiscovery;
use PHPHtmlParser\Discovery\DomParserDiscovery;
use PHPHtmlParser\Dom\Node\Collection;
use PHPHtmlParser\Dom\RootAccessTrait;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
use PHPHtmlParser\Exceptions\CircularException;
use PHPHtmlParser\Exceptions\LogicalException;
use PHPHtmlParser\Exceptions\NotLoadedException;
use PHPHtmlParser\Exceptions\StrictException;
use PHPHtmlParser\Exceptions\UnknownChildTypeException;
use Psr\Http\Client\ClientExceptionInterface;
use Psr\Http\Client\ClientInterface;
use Psr\Http\Message\RequestInterface;
/**
 * Class Dom.
 *
 * Facade that cleans an input string, parses it into a node tree and exposes
 * selector-based lookups on the resulting root node.
 */
class Dom implements DomInterface
{
    use RootAccessTrait;

    /**
     * The charset we would like the output to be in.
     *
     * @var string
     */
    private $defaultCharset = 'UTF-8';

    /**
     * Cursor over the cleaned document string; null until a load call succeeds.
     *
     * @var Content
     */
    private $content;

    /**
     * Options applied to every load call, before the per-call options.
     *
     * @var ?Options
     */
    private $globalOptions;

    /**
     * @var ParserInterface
     */
    private $domParser;

    /**
     * @var CleanerInterface
     */
    private $domCleaner;

    /**
     * Falls back to the discovery classes when no parser/cleaner is injected.
     */
    public function __construct(?ParserInterface $domParser = null, ?CleanerInterface $domCleaner = null)
    {
        if ($domParser === null) {
            $domParser = DomParserDiscovery::find();
        }
        if ($domCleaner === null) {
            $domCleaner = CleanerDiscovery::find();
        }

        $this->domParser = $domParser;
        $this->domCleaner = $domCleaner;
    }

    /**
     * Returns the inner html of the root node.
     *
     * @throws ChildNotFoundException
     * @throws UnknownChildTypeException
     * @throws NotLoadedException
     */
    public function __toString(): string
    {
        $this->isLoaded();

        return $this->root->innerHtml();
    }

    /**
     * Loads the dom from a document file/url.
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws Exceptions\ContentLengthException
     * @throws LogicalException when the file cannot be read
     * @throws StrictException
     */
    public function loadFromFile(string $file, ?Options $options = null): Dom
    {
        // Warnings are suppressed on purpose; failure is reported through the exception below.
        $content = @\file_get_contents($file);
        if ($content === false) {
            throw new LogicalException('file_get_contents failed and returned false when trying to read "' . $file . '".');
        }

        return $this->loadStr($content, $options);
    }

    /**
     * Fetches the content of a url through a PSR-18 client and loads it.
     *
     * When no client is given a Guzzle client is created; when no request is
     * given a GET request for $url is created.
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws Exceptions\ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     * @throws ClientExceptionInterface
     */
    public function loadFromUrl(string $url, ?Options $options = null, ?ClientInterface $client = null, ?RequestInterface $request = null): Dom
    {
        if ($client === null) {
            $client = new Client();
        }
        if ($request === null) {
            $request = new Request('GET', $url);
        }

        $response = $client->sendRequest($request);
        $content = $response->getBody()->getContents();

        return $this->loadStr($content, $options);
    }

    /**
     * Parses the html of the given string. Used for load(), loadFromFile(),
     * and loadFromUrl().
     *
     * Options are layered: defaults, then the global options, then $options.
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws Exceptions\ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    public function loadStr(string $str, ?Options $options = null): Dom
    {
        $localOptions = new Options();
        if ($this->globalOptions !== null) {
            $localOptions = $localOptions->setFromOptions($this->globalOptions);
        }
        if ($options !== null) {
            $localOptions = $localOptions->setFromOptions($options);
        }

        $html = $this->domCleaner->clean($str, $localOptions, $this->defaultCharset);

        $this->content = new Content($html);

        // NOTE(review): the size passed here is the length of the raw input,
        // not of the cleaned string held by $this->content — confirm intended.
        $this->root = $this->domParser->parse($localOptions, $this->content, \strlen($str));
        $this->domParser->detectCharset($localOptions, $this->defaultCharset, $this->root);

        return $this;
    }

    /**
     * Sets the global options object to be used by all load calls.
     */
    public function setOptions(Options $options): Dom
    {
        $this->globalOptions = $options;

        return $this;
    }

    /**
     * Find elements by css selector on the root node.
     *
     * The nullable type is spelled out explicitly: relying on "= null" to make
     * an "int" parameter nullable is deprecated as of PHP 8.4.
     *
     * @param ?int $nth optional index into the matches
     *
     * @throws NotLoadedException
     * @throws ChildNotFoundException
     *
     * @return mixed|Collection|null
     */
    public function find(string $selector, ?int $nth = null)
    {
        $this->isLoaded();

        return $this->root->find($selector, $nth);
    }

    /**
     * Simple wrapper function that returns an element by the
     * id.
     *
     * @param $id value appended to "#" to build the selector
     *
     * @throws NotLoadedException
     * @throws ChildNotFoundException
     *
     * @return mixed|Collection|null
     */
    public function getElementById($id)
    {
        $this->isLoaded();

        return $this->find('#' . $id, 0);
    }

    /**
     * Simple wrapper function that returns all elements by
     * tag name.
     *
     * @throws NotLoadedException
     * @throws ChildNotFoundException
     *
     * @return mixed|Collection|null
     */
    public function getElementsByTag(string $name)
    {
        $this->isLoaded();

        return $this->find($name);
    }

    /**
     * Simple wrapper function that returns all elements by
     * class name.
     *
     * @throws NotLoadedException
     * @throws ChildNotFoundException
     *
     * @return mixed|Collection|null
     */
    public function getElementsByClass(string $class)
    {
        $this->isLoaded();

        return $this->find('.' . $class);
    }

    /**
     * Checks if the load methods have been called.
     *
     * @throws NotLoadedException
     */
    private function isLoaded(): void
    {
        if (\is_null($this->content)) {
            throw new NotLoadedException('Content is not loaded!');
        }
    }
}
@@ -0,0 +1,130 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Dom;
use PHPHtmlParser\Contracts\Dom\CleanerInterface;
use PHPHtmlParser\Exceptions\LogicalException;
use PHPHtmlParser\Options;
/**
 * Default cleaner: optionally gunzips the input, converts it to the working
 * encoding and strips markup the parser should not see.
 */
class Cleaner implements CleanerInterface
{
    /**
     * Cleans the html of any non-html information.
     *
     * Steps, in order: gzip detection/decoding, encoding setup, removal of
     * whitespace between a closing quote and ">", line-break replacement,
     * then stripping of doctype, comments, CDATA and (depending on options)
     * script tags, style tags and smarty blocks.
     *
     * @throws LogicalException
     */
    public function clean(string $str, Options $options, string $defaultCharset): string
    {
        if (!$options->isCleanupInput()) {
            // skip entire cleanup step
            return $str;
        }

        // check if the string is gziped
        $is_gzip = 0 === \mb_strpos($str, "\x1f" . "\x8b" . "\x08", 0, 'US-ASCII');
        if ($is_gzip) {
            $str = \gzdecode($str);
            if ($str === false) {
                throw new LogicalException('gzdecode returned false. Error when trying to decode the string.');
            }
        }

        // we must handle character encoding
        $str = $this->setUpRegexEncoding($str, $options, $defaultCharset);

        // remove white space before closing tags
        $str = $this->regexReplace("'\s+>", "'>", $str, 'Error when attempting to clean single quotes.');
        $str = $this->regexReplace('"\s+>', '">', $str, 'Error when attempting to clean double quotes.');

        // clean out the \n\r
        $replace = ' ';
        if ($options->isPreserveLineBreaks()) {
            $replace = '&#10;';
        }
        // str_replace() always returns a string for a string subject, so no failure check is needed.
        $str = \str_replace(["\r\n", "\r", "\n"], $replace, $str);

        // strip the doctype
        $str = $this->regexReplace('<!doctype(.*?)>', '', $str, 'Error when attempting to strip the doctype.');

        // strip out comments
        $str = $this->regexReplace('<!--(.*?)-->', '', $str, 'Error when attempting to strip comments.');

        // strip out cdata
        $str = $this->regexReplace("<!\[CDATA\[(.*?)\]\]>", '', $str, 'Error when attempting to strip out cdata.');

        // strip out <script> tags
        if ($options->isRemoveScripts()) {
            $str = $this->regexReplace("<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>", '', $str, 'Error when attempting to remove scripts 1.');
            $str = $this->regexReplace("<\s*script\s*>(.*?)<\s*/\s*script\s*>", '', $str, 'Error when attempting to remove scripts 2.');
        }

        // strip out <style> tags
        if ($options->isRemoveStyles()) {
            $str = $this->regexReplace("<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>", '', $str, 'Error when attempting to strip out style tags 1.');
            $str = $this->regexReplace("<\s*style\s*>(.*?)<\s*/\s*style\s*>", '', $str, 'Error when attempting to strip out style tags 2.');
        }

        // strip smarty scripts
        if ($options->isRemoveSmartyScripts()) {
            $str = $this->regexReplace("(\{\w)(.*?)(\})", '', $str, 'Error when attempting to remove smarty scripts.');
        }

        return $str;
    }

    /**
     * Runs one case-insensitive multibyte regex replacement and turns any
     * non-string result into a LogicalException.
     *
     * mb_eregi_replace() can return null as well as false; both are treated
     * as failure so a non-string never reaches the next step.
     *
     * @param string $errorContext sentence appended to the exception message
     *
     * @throws LogicalException
     */
    private function regexReplace(string $pattern, string $replacement, string $str, string $errorContext): string
    {
        $result = \mb_eregi_replace($pattern, $replacement, $str);
        if (!\is_string($result)) {
            $returned = $result === null ? 'null' : 'false';

            throw new LogicalException('mb_eregi_replace returned ' . $returned . ' instead of a string. ' . $errorContext);
        }

        return $result;
    }

    /**
     * Sets up the mb_regex_encoding and converts the text to that encoding.
     *
     * The enforced encoding from the options wins over the default charset.
     *
     * @throws LogicalException
     */
    private function setUpRegexEncoding(string $str, Options $options, string $defaultCharset): string
    {
        $encoding = $defaultCharset;
        $enforceEncoding = $options->getEnforceEncoding();
        if ($enforceEncoding !== null) {
            //  they want to enforce the given encoding
            $encoding = $enforceEncoding;
        }

        if (!\mb_regex_encoding($encoding)) {
            throw new LogicalException('Character encoding was not able to be changed to ' . $encoding . '.');
        }

        $converted = \mb_convert_encoding($str, $encoding);
        if (!\is_string($converted)) {
            // Without this guard a failed conversion would surface as a TypeError on the return type.
            throw new LogicalException('mb_convert_encoding was not able to convert the string to ' . $encoding . '.');
        }

        return $converted;
    }
}
@@ -0,0 +1,495 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Dom\Node;
use PHPHtmlParser\Contracts\Selector\SelectorInterface;
use PHPHtmlParser\Dom\Tag;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
use PHPHtmlParser\Exceptions\CircularException;
use PHPHtmlParser\Exceptions\ParentNotFoundException;
use PHPHtmlParser\Exceptions\Tag\AttributeNotFoundException;
use PHPHtmlParser\Finder;
use PHPHtmlParser\Selector\Selector;
use stringEncode\Encode;
/**
* Dom node object.
*
* @property-read string $outerhtml
* @property-read string $innerhtml
* @property-read string $innerText
* @property-read string $text
* @property-read Tag $tag
* @property-read InnerNode $parent
*/
abstract class AbstractNode
{
/**
* Contains the tag name/type.
*
* @var ?Tag
*/
protected $tag;
/**
* Contains a list of attributes on this tag.
*
* @var array
*/
protected $attr = [];
/**
* Contains the parent Node.
*
* @var ?InnerNode
*/
protected $parent;
/**
* The unique id of the class. Given by PHP.
*
* @var int
*/
protected $id;
/**
* The encoding class used to encode strings.
*
* @var mixed
*/
protected $encode;
/**
* An array of all the children.
*
* @var array
*/
protected $children = [];
/**
* @var bool
*/
protected $htmlSpecialCharsDecode = false;
/**
* @var int
*/
private static $count = 0;
/**
 * Assigns this node the next value of the class-wide counter as its id.
 */
public function __construct()
{
    $this->id = self::$count++;
}
/**
 * Drops the references this node holds (children, attributes, parent, tag).
 */
public function __destruct()
{
    $this->children = [];
    $this->attr = [];
    $this->parent = null;
    $this->tag = null;
}
/**
 * Magic getter: an attribute with the given name wins; otherwise a small,
 * case-insensitive set of names is mapped to the matching method.
 *
 * @return mixed the attribute value, the mapped method's result, or null
 */
public function __get(string $key)
{
    // attributes take precedence over the virtual properties
    if ($this->getAttribute($key) !== null) {
        return $this->getAttribute($key);
    }

    $accessors = [
        'outerhtml' => 'outerHtml',
        'innerhtml' => 'innerHtml',
        'innertext' => 'innerText',
        'text'      => 'text',
        'tag'       => 'getTag',
        'parent'    => 'getParent',
    ];

    $name = \strtolower($key);
    if (isset($accessors[$name])) {
        return $this->{$accessors[$name]}();
    }

    return null;
}
/**
* String conversion: returns the result of outerHtml().
*
* @return string
*/
public function __toString()
{
return $this->outerHtml();
}
/**
* Stores the htmlSpecialCharsDecode flag on this node.
*
* @param bool $htmlSpecialCharsDecode
*/
public function setHtmlSpecialCharsDecode($htmlSpecialCharsDecode = false): void
{
$this->htmlSpecialCharsDecode = $htmlSpecialCharsDecode;
}
/**
* Returns the id of this object, as assigned in the constructor.
*/
public function id(): int
{
return $this->id;
}
/**
* Returns the parent of this node.
*
* @return InnerNode|null null when no parent is set
*/
public function getParent(): ?InnerNode
{
return $this->parent;
}
/**
 * Re-parents this node: detaches it from the current parent (if any and if
 * different), stores the new parent and registers this node as its child.
 *
 * @throws ChildNotFoundException
 * @throws CircularException
 */
public function setParent(InnerNode $parent): AbstractNode
{
    $current = $this->parent;
    if ($current !== null) {
        if ($current->id() == $parent->id()) {
            // nothing to do, already attached to this parent
            return $this;
        }

        // detach from the previous parent
        $current->removeChild($this->id);
    }

    $this->parent = $parent;

    // register with the new parent
    $parent->addChild($this);

    return $this;
}
/**
 * Removes this node and all its children from the
 * DOM tree.
 *
 * A node without a parent is simply cleared; the parent is only touched
 * when one is set, so calling this on a detached node no longer fails with
 * a call on null.
 *
 * @return void
 */
public function delete()
{
    // keep a local handle so the parent is still reachable after removeChild()
    $parent = $this->parent;
    if ($parent !== null) {
        $parent->removeChild($this->id);
        $parent->clear();
    }

    $this->clear();
}
/**
* Stores the encoding object on this node and forwards it to the node's tag.
*
* @return void
*/
public function propagateEncoding(Encode $encode)
{
$this->encode = $encode;
$this->tag->setEncoding($encode);
}
/**
 * Tells whether a node with the given id is found by getAncestor().
 */
public function isAncestor(int $id): bool
{
    return $this->getAncestor($id) !== null;
}
/**
 * Walks up through the parents looking for the node with the given id.
 *
 * @return AbstractNode|null the matching ancestor, or null when there is none
 */
public function getAncestor(int $id)
{
    if ($this->parent === null) {
        return null;
    }

    if ($this->parent->id() == $id) {
        return $this->parent;
    }

    return $this->parent->getAncestor($id);
}
/**
 * Tells whether nextSibling() would succeed for this node.
 */
public function hasNextSibling(): bool
{
    try {
        $this->nextSibling();
    } catch (ParentNotFoundException | ChildNotFoundException $e) {
        // either there is no parent or the parent has no following child
        return false;
    }

    return true;
}
/**
 * Asks the parent for the child that follows this node.
 *
 * @throws ChildNotFoundException
 * @throws ParentNotFoundException when this node has no parent
 */
public function nextSibling(): AbstractNode
{
    $parent = $this->parent;
    if ($parent === null) {
        throw new ParentNotFoundException('Parent is not set for this node.');
    }

    return $parent->nextChild($this->id);
}
/**
 * Asks the parent for the child that precedes this node.
 *
 * @throws ChildNotFoundException
 * @throws ParentNotFoundException when this node has no parent
 */
public function previousSibling(): AbstractNode
{
    $parent = $this->parent;
    if ($parent === null) {
        throw new ParentNotFoundException('Parent is not set for this node.');
    }

    return $parent->previousChild($this->id);
}
/**
* Gets the tag object of this node.
*
* NOTE(review): the $tag property is documented as ?Tag while this method
* declares a non-nullable Tag return type — confirm a tag is always set
* before this is called.
*/
public function getTag(): Tag
{
return $this->tag;
}
/**
 * Replaces the tag of this node. A string is wrapped in a new Tag object.
 *
 * @param string|Tag $tag
 */
public function setTag($tag): AbstractNode
{
    $this->tag = \is_string($tag) ? new Tag($tag) : $tag;

    // clear any cache
    $this->clear();

    return $this;
}
/**
 * Returns the tag's attributes as a name => value map, unwrapping each
 * attribute object to its plain value.
 */
public function getAttributes(): array
{
    $values = [];
    foreach ($this->tag->getAttributes() as $name => $attributeDTO) {
        $values[$name] = $attributeDTO->getValue();
    }

    return $values;
}
/**
 * Returns the value of the named attribute from the tag, or null when the
 * tag reports that no such attribute exists.
 */
public function getAttribute(string $key): ?string
{
    try {
        return $this->tag->getAttribute($key)->getValue();
    } catch (AttributeNotFoundException $e) {
        // no attribute with this key exists
        return null;
    }
}
/**
* A wrapper method that simply calls the hasAttribute method
* on the tag of this node and returns its result.
*/
public function hasAttribute(string $key): bool
{
return $this->tag->hasAttribute($key);
}
/**
* A wrapper method that simply calls the setAttribute method
* on the tag of this node, then calls clear() on this node.
*
* @return AbstractNode this node, for chaining
*/
public function setAttribute(string $key, ?string $value, bool $doubleQuote = true): AbstractNode
{
$this->tag->setAttribute($key, $value, $doubleQuote);
//clear any cache
$this->clear();
return $this;
}
/**
* Removes a single attribute from this node's tag and invalidates cached output.
*
* A wrapper method that simply calls the removeAttribute method
* on the tag of this node.
*/
public function removeAttribute(string $key): void
{
$this->tag->removeAttribute($key);
//clear any cache
$this->clear();
}
/**
* Removes every attribute from this node's tag and invalidates cached output.
*
* A wrapper method that simply calls the removeAllAttributes
* method on the tag of this node.
*/
public function removeAllAttributes(): void
{
$this->tag->removeAllAttributes();
//clear any cache
$this->clear();
}
/**
 * Function to locate a specific ancestor tag in the path to the root.
 *
 * The search starts with this node itself and then follows getParent()
 * upwards until a node whose tag name equals $tag is found.
 *
 * @throws ParentNotFoundException when no node on the path carries that tag name
 */
public function ancestorByTag(string $tag): AbstractNode
{
    // Start by including ourselves in the comparison.
    for ($candidate = $this; $candidate !== null; $candidate = $candidate->getParent()) {
        if ($candidate->tag->name() == $tag) {
            return $candidate;
        }
    }

    throw new ParentNotFoundException('Could not find an ancestor with "' . $tag . '" tag');
}
/**
 * Find elements by css selector.
 *
 * When no selector object is supplied, a Selector is built from $selectorString.
 * With $nth given, only the match at that offset is returned (null when there
 * is no such offset); otherwise the complete result of the selector is returned.
 *
 * @throws ChildNotFoundException
 *
 * @return mixed|Collection|null
 */
public function find(string $selectorString, ?int $nth = null, ?SelectorInterface $selector = null)
{
    $selector = $selector ?? new Selector($selectorString);
    $matches = $selector->find($this);

    if ($nth === null) {
        return $matches;
    }

    // return nth-element or null
    return isset($matches[$nth]) ? $matches[$nth] : null;
}
/**
 * Find node by id.
 *
 * Builds a Finder for the given id and runs it starting from this node.
 *
 * @throws ChildNotFoundException
 * @throws ParentNotFoundException
 *
 * @return bool|AbstractNode
 */
public function findById(int $id)
{
    return (new Finder($id))->find($this);
}
/**
* Gets the inner html of this node.
*
* Must be implemented by every concrete node type.
*/
abstract public function innerHtml(): string;
/**
* Gets the html of this node, including its own
* tag.
*
* Must be implemented by every concrete node type.
*/
abstract public function outerHtml(): string;
/**
* Gets the text of this node (if there is any text).
*
* Must be implemented by every concrete node type.
*/
abstract public function text(): string;
/**
* Checks whether this node is a text node.
*
* The base implementation always answers false; text node classes are
* expected to override it.
*/
public function isTextNode(): bool
{
return false;
}
/**
* Call this when something in the node tree has changed. Like a child has been added
* or a parent has been changed.
*
* Implementations are expected to drop whatever they have cached; the mutating
* methods of this class (setTag, setAttribute, removeAttribute, ...) call it.
*/
abstract protected function clear(): void;
}
@@ -0,0 +1,45 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Dom\Node;
use ArrayIterator;
use Countable;
use IteratorAggregate;
use PHPHtmlParser\Dom\Tag;
/**
 * Dom node object which will allow users to use it as
 * an array.
 *
 * Iteration and counting are both driven by the array that the concrete
 * class returns from getIteratorArray().
 *
 * @property-read string    $outerhtml
 * @property-read string    $innerhtml
 * @property-read string    $innerText
 * @property-read string    $text
 * @property-read Tag       $tag
 * @property-read InnerNode $parent
 */
abstract class ArrayNode extends AbstractNode implements IteratorAggregate, Countable
{
    /**
     * Builds an iterator over the array supplied by getIteratorArray().
     */
    public function getIterator(): ArrayIterator
    {
        $items = $this->getIteratorArray();

        return new ArrayIterator($items);
    }

    /**
     * Returns the number of entries in the array supplied by getIteratorArray().
     */
    public function count(): int
    {
        $items = $this->getIteratorArray();

        return \count($items);
    }

    /**
     * Returns the array to be used by the iterator.
     */
    abstract protected function getIteratorArray(): array;
}
@@ -0,0 +1,156 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Dom\Node;
use ArrayAccess;
use ArrayIterator;
use Countable;
use IteratorAggregate;
use PHPHtmlParser\Exceptions\EmptyCollectionException;
/**
 * Class Collection.
 *
 * An array-like container (iterable, countable, offset-accessible) for nodes.
 * Method calls, property reads and string casts are forwarded to the first
 * element when that element is an AbstractNode.
 */
class Collection implements IteratorAggregate, ArrayAccess, Countable
{
    /**
     * The collection of Nodes.
     *
     * @var array
     */
    protected $collection = [];

    /**
     * Attempts to call the method on the first node in
     * the collection.
     *
     * @throws EmptyCollectionException when the first element is not an AbstractNode
     *                                  (which includes the empty collection)
     *
     * @return mixed whatever the forwarded method returns
     */
    public function __call(string $method, array $arguments)
    {
        // reset() yields false for an empty array, which fails the instanceof check below.
        $node = \reset($this->collection);
        if ($node instanceof AbstractNode) {
            return \call_user_func_array([$node, $method], $arguments);
        }
        throw new EmptyCollectionException('The collection does not contain any Nodes.');
    }

    /**
     * Attempts to apply the magic get to the first node
     * in the collection.
     *
     * @param mixed $key
     *
     * @throws EmptyCollectionException when the first element is not an AbstractNode
     *
     * @return mixed
     */
    public function __get($key)
    {
        $node = \reset($this->collection);
        if ($node instanceof AbstractNode) {
            return $node->$key;
        }
        throw new EmptyCollectionException('The collection does not contain any Nodes.');
    }

    /**
     * Applies the magic string method to the first node in
     * the collection. Yields an empty string when there is no such node.
     */
    public function __toString(): string
    {
        $node = \reset($this->collection);
        if ($node instanceof AbstractNode) {
            return (string) $node;
        }

        return '';
    }

    /**
     * Returns the count of the collection.
     */
    public function count(): int
    {
        return \count($this->collection);
    }

    /**
     * Returns an iterator for the collection.
     */
    public function getIterator(): ArrayIterator
    {
        return new ArrayIterator($this->collection);
    }

    /**
     * Set an element at the given offset; a null offset appends.
     *
     * @param mixed $offset
     * @param mixed $value
     */
    public function offsetSet($offset, $value): void
    {
        if (\is_null($offset)) {
            $this->collection[] = $value;
        } else {
            $this->collection[$offset] = $value;
        }
    }

    /**
     * Checks if an offset exists.
     *
     * @param mixed $offset
     */
    public function offsetExists($offset): bool
    {
        return isset($this->collection[$offset]);
    }

    /**
     * Unset a collection Node.
     *
     * @param mixed $offset
     */
    public function offsetUnset($offset): void
    {
        unset($this->collection[$offset]);
    }

    /**
     * Gets a node at the given offset, or null.
     *
     * The attribute below silences the PHP 8.1+ deprecation about the missing
     * `mixed` return type of ArrayAccess::offsetGet() while staying parseable on
     * older PHP versions, where the line is read as a plain `#` comment.
     *
     * @param mixed $offset
     *
     * @return mixed
     */
    #[\ReturnTypeWillChange]
    public function offsetGet($offset)
    {
        return $this->collection[$offset] ?? null;
    }

    /**
     * Returns this collection as an array.
     */
    public function toArray(): array
    {
        return $this->collection;
    }

    /**
     * Similar to jQuery "each" method. Calls the callback with each
     * Node in this collection, passing the element first and its key second.
     * Nothing is returned.
     */
    public function each(callable $callback)
    {
        foreach ($this->collection as $key => $value) {
            $callback($value, $key);
        }
    }
}
@@ -0,0 +1,244 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Dom\Node;
use PHPHtmlParser\Dom\Tag;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
use PHPHtmlParser\Exceptions\UnknownChildTypeException;
/**
* Class HtmlNode.
*
* @property-read string $outerhtml
* @property-read string $innerhtml
* @property-read string $innerText
* @property-read string $text
* @property-read Tag $tag
* @property-read InnerNode $parent
*/
class HtmlNode extends InnerNode
{
/**
* Remembers what the innerHtml was if it was scanned previously.
*
* @var ?string
*/
protected $innerHtml;
/**
* Remembers what the outerHtml was if it was scanned previously.
*
* @var ?string
*/
protected $outerHtml;
/**
* Remembers what the innerText was if it was scanned previously.
*
* @var ?string
*/
protected $innerText;
/**
* Remembers what the text was if it was scanned previously.
*
* @var ?string
*/
protected $text;
/**
* Remembers what the text was when we looked into all our
* children nodes.
*
* @var ?string
*/
protected $textWithChildren;
/**
 * Sets up the tag of this node.
 *
 * A Tag instance is stored as given; any other value is handed to the Tag
 * constructor. The parent constructor runs once the tag is in place.
 *
 * @param string|Tag $tag
 */
public function __construct($tag)
{
    $this->tag = $tag instanceof Tag ? $tag : new Tag($tag);
    parent::__construct();
}
/**
* Applies the html-special-chars-decode flag to this node and to its tag.
*
* The parent implementation is called first, then the same value is pushed to the tag.
*
* @param bool $htmlSpecialCharsDecode
*/
public function setHtmlSpecialCharsDecode($htmlSpecialCharsDecode = false): void
{
parent::setHtmlSpecialCharsDecode($htmlSpecialCharsDecode);
$this->tag->setHtmlSpecialCharsDecode($htmlSpecialCharsDecode);
}
/**
 * Gets the inner html of this node.
 *
 * Children are visited in sibling order: text nodes contribute their text,
 * html nodes their outer html. The concatenated result is cached until
 * clear() is called. A node without children yields an empty string.
 *
 * @throws ChildNotFoundException
 * @throws UnknownChildTypeException when a child is neither a TextNode nor an HtmlNode
 */
public function innerHtml(): string
{
    if (!$this->hasChildren()) {
        // no children
        return '';
    }
    if ($this->innerHtml !== null) {
        // we already know the result.
        return $this->innerHtml;
    }

    $html = '';
    $current = $this->firstChild();
    do {
        if ($current instanceof TextNode) {
            $html .= $current->text();
        } elseif ($current instanceof HtmlNode) {
            $html .= $current->outerHtml();
        } else {
            throw new UnknownChildTypeException('Unknown child type "' . \get_class($current) . '" found in node');
        }

        try {
            $current = $this->nextChild($current->id());
        } catch (ChildNotFoundException $noMoreChildren) {
            // the last child has no successor; this ends the walk
            $current = null;
        }
    } while ($current !== null);

    // remember the results
    $this->innerHtml = $html;

    return $html;
}
/**
 * Gets the inner text of this node.
 *
 * This is the inner html with all tags stripped (strip_tags); the result is cached.
 *
 * @throws ChildNotFoundException
 * @throws UnknownChildTypeException
 */
public function innerText(): string
{
    if ($this->innerText !== null) {
        return $this->innerText;
    }

    $this->innerText = \strip_tags($this->innerHtml());

    return $this->innerText;
}
/**
 * Gets the html of this node, including its own
 * tag.
 *
 * The node whose tag is named 'root' renders as its inner html only. A
 * self-closing tag renders as just its opening tag (that result is not cached).
 * Everything else is opening tag + inner html + closing tag, cached until clear().
 *
 * @throws ChildNotFoundException
 * @throws UnknownChildTypeException
 */
public function outerHtml(): string
{
    $tag = $this->tag;

    // special handling for root
    if ($tag->name() == 'root') {
        return $this->innerHtml();
    }

    if ($this->outerHtml !== null) {
        // we already know the results.
        return $this->outerHtml;
    }

    $opening = $tag->makeOpeningTag();
    if ($tag->isSelfClosing()) {
        // ignore any children... there should not be any though
        return $opening;
    }

    $html = $opening . $this->innerHtml() . $tag->makeClosingTag();

    // remember the results
    $this->outerHtml = $html;

    return $html;
}
/**
 * Gets the text of this node (if there is any text). Or get all the text
 * in this node, including children.
 *
 * Without $lookInChildren only the direct TextNode children contribute. With
 * it, HtmlNode children are asked recursively as well. Each mode has its own cache.
 */
public function text(bool $lookInChildren = false): string
{
    $cached = $lookInChildren ? $this->textWithChildren : $this->text;
    if ($cached !== null) {
        // we already know the results.
        return $cached;
    }

    // gather the text of the children, in order
    $collected = '';
    foreach ($this->children as $entry) {
        /** @var AbstractNode $node */
        $node = $entry['node'];
        if ($node instanceof TextNode) {
            $collected .= $node->text;
        } elseif ($lookInChildren && $node instanceof HtmlNode) {
            $collected .= $node->text($lookInChildren);
        }
    }

    // remember our result in the cache that belongs to this mode
    if ($lookInChildren) {
        $this->textWithChildren = $collected;
    } else {
        $this->text = $collected;
    }

    return $collected;
}
/**
 * Call this when something in the node tree has changed. Like a child has been added
 * or a parent has been changed.
 *
 * Drops every cached rendering of this node (inner html, outer html, inner text,
 * and both text variants) and then asks the parent, if any, to do the same,
 * since the parent's cached output embeds this node's output.
 */
protected function clear(): void
{
    $this->innerHtml = null;
    $this->outerHtml = null;
    // innerText() derives its value from innerHtml(), so it has to be dropped with it.
    $this->innerText = null;
    $this->text = null;
    $this->textWithChildren = null;

    if ($this->parent !== null) {
        $this->parent->clear();
    }
}
/**
* Returns all children of this html node.
*
* This is the array the node is iterated over and counted by; it is the
* result of getChildren().
*/
protected function getIteratorArray(): array
{
return $this->getChildren();
}
}
@@ -0,0 +1,442 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Dom\Node;
use PHPHtmlParser\Dom\Tag;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
use PHPHtmlParser\Exceptions\CircularException;
use PHPHtmlParser\Exceptions\LogicalException;
use stringEncode\Encode;
/**
* Inner node of the html tree, might have children.
*
* @property-read string $outerhtml
* @property-read string $innerhtml
* @property-read string $innerText
* @property-read string $text
* @property-read Tag $tag
* @property-read InnerNode $parent
*/
abstract class InnerNode extends ArrayNode
{
/**
* An array of all the children.
*
* @var array
*/
protected $children = [];
/**
 * Sets the encoding class to this node and propagates it
 * to all its children.
 *
 * The encoder is stored on the node, handed to the node's tag, and then
 * passed on to every child node.
 */
public function propagateEncoding(Encode $encode): void
{
    $this->encode = $encode;
    $this->tag->setEncoding($encode);

    // hand the same encoder down to each child
    foreach ($this->children as $entry) {
        /** @var AbstractNode $childNode */
        $childNode = $entry['node'];
        $childNode->propagateEncoding($encode);
    }
}
/**
* Checks if this node has children.
*
* True when the children array holds at least one entry.
*/
public function hasChildren(): bool
{
return !empty($this->children);
}
/**
 * Returns the child by id.
 *
 * Only direct children of this node are considered.
 *
 * @throws ChildNotFoundException when this node has no child with that id
 */
public function getChild(int $id): AbstractNode
{
    if (isset($this->children[$id])) {
        return $this->children[$id]['node'];
    }

    throw new ChildNotFoundException("Child '$id' not found in this node.");
}
/**
 * Returns a new array of child nodes.
 *
 * The list is built by starting at the first child and following the 'next'
 * links; it ends when a child has no successor. An empty node yields [].
 *
 * @throws CircularException when following the links leads back to a child already visited
 */
public function getChildren(): array
{
    $collected = [];
    $seenIds = [];

    try {
        $current = $this->firstChild();
        while (true) {
            $collected[] = $current;
            $seenIds[] = $current->id;

            $current = $this->nextChild($current->id());
            if (\in_array($current->id, $seenIds, true)) {
                throw new CircularException('Circular sibling referance found. Child with id ' . $current->id() . ' found twice.');
            }
        }
    } catch (ChildNotFoundException $endOfChildren) {
        // we are done looking for children
    }

    return $collected;
}
/**
* Counts children.
*
* Only direct children are counted (the size of the children array).
*/
public function countChildren(): int
{
return \count($this->children);
}
/**
* Adds a child node to this node, either at the end or in front of an existing child.
*
* The children array is keyed by child id and each entry stores the node plus the ids
* of its previous and next sibling; array order is kept in sync with those links.
*
* @param AbstractNode $child  the node to add
* @param int          $before id of the existing child to insert in front of; a negative
*                             value (the default) appends. It is ignored when this node
*                             has no children yet.
*
* @return bool true when the child was added; false when it is already a child of this
*              node or when $before does not name an existing child
*
* @throws ChildNotFoundException
* @throws CircularException when the child is an ancestor of this node or this node itself
* @throws LogicalException
*/
public function addChild(AbstractNode $child, int $before = -1): bool
{
// id of the sibling that will precede the new child (null = new child becomes first)
$key = null;
// check integrity
if ($this->isAncestor($child->id())) {
throw new CircularException('Can not add child. It is my ancestor.');
}
// check if child is itself
if ($child->id() == $this->id) {
throw new CircularException('Can not set itself as a child.');
}
// id of the sibling that will follow the new child (null = new child becomes last)
$next = null;
if ($this->hasChildren()) {
if (isset($this->children[$child->id()])) {
// we already have this child
return false;
}
if ($before >= 0) {
if (!isset($this->children[$before])) {
return false;
}
// splice the new child between $before and whatever preceded it
$key = $this->children[$before]['prev'];
// NOTE(review): the truthiness test also treats a previous id of 0 as "no previous sibling"
if ($key) {
$this->children[$key]['next'] = $child->id();
}
$this->children[$before]['prev'] = $child->id();
$next = $before;
} else {
// append: the current last child becomes the predecessor
$sibling = $this->lastChild();
$key = $sibling->id();
$this->children[$key]['next'] = $child->id();
}
}
$keys = \array_keys($this->children);
$insert = [
'node' => $child,
'next' => $next,
'prev' => $key,
];
// array position for the new entry: right after the predecessor, or first when there is none
$index = $key ? (int) (\array_search($key, $keys, true) + 1) : 0;
// insert into the key list and the value list at the same position, then zip them back together
\array_splice($keys, $index, 0, (string) $child->id());
$children = \array_values($this->children);
\array_splice($children, $index, 0, [$insert]);
// add the child
$combination = \array_combine($keys, $children);
if ($combination === false) {
// The number of elements for each array isn't equal or if the arrays are empty.
throw new LogicalException('array combine failed during add child method call.');
}
$this->children = $combination;
// tell child I am the new parent
$child->setParent($this);
//clear any cache
$this->clear();
return true;
}
/**
* Insert element before child with provided id.
*
* Thin alias for addChild($child, $id); the return value and the exceptions are
* those of addChild().
*
* @throws ChildNotFoundException
* @throws CircularException
*/
public function insertBefore(AbstractNode $child, int $id): bool
{
return $this->addChild($child, $id);
}
/**
 * Insert element after the child with the provided id.
 *
 * When the reference child has a following sibling, the new child is inserted in
 * front of that sibling; otherwise it is appended. Returns false when there is no
 * child with the given id.
 *
 * @throws ChildNotFoundException
 * @throws CircularException
 */
public function insertAfter(AbstractNode $child, int $id): bool
{
    if (!isset($this->children[$id])) {
        return false;
    }

    $followingId = $this->children[$id]['next'] ?? null;
    if (\is_int($followingId)) {
        return $this->addChild($child, $followingId);
    }

    // clear cache
    $this->clear();

    return $this->addChild($child);
}
/**
 * Removes the child by id.
 *
 * The neighbours of the removed child are linked to each other, the entry is
 * dropped and cached output is cleared. An unknown id is silently ignored.
 *
 * @return InnerNode this node, to allow chaining
 */
public function removeChild(int $id): InnerNode
{
    if (isset($this->children[$id])) {
        // handle moving next and previous assignments.
        ['next' => $following, 'prev' => $preceding] = $this->children[$id];
        if ($following !== null) {
            $this->children[$following]['prev'] = $preceding;
        }
        if ($preceding !== null) {
            $this->children[$preceding]['next'] = $following;
        }

        // remove the child
        unset($this->children[$id]);

        //clear any cache
        $this->clear();
    }

    return $this;
}
/**
 * Check if has next Child.
 *
 * Despite the name this returns the stored 'next' link of the child, i.e. the
 * id of the following sibling or null when there is none.
 *
 * @throws ChildNotFoundException when this node has no child with that id
 *
 * @return mixed
 */
public function hasNextChild(int $id)
{
    $entryKey = $this->getChild($id)->id();

    return $this->children[$entryKey]['next'];
}
/**
 * Attempts to get the next child.
 *
 * @throws ChildNotFoundException when the id is unknown or the child has no following sibling
 *
 * @uses $this->getChild()
 */
public function nextChild(int $id): AbstractNode
{
    $current = $this->getChild($id);
    $followingId = $this->children[$current->id()]['next'];

    // null (or anything else that is not an int) means there is no following sibling
    if (!\is_int($followingId)) {
        throw new ChildNotFoundException("Child '$id' next sibling not found in this node.");
    }

    return $this->getChild($followingId);
}
/**
 * Attempts to get the previous child.
 *
 * @throws ChildNotFoundException when the id is unknown or the child has no preceding sibling
 *
 * @uses $this->getChild()
 */
public function previousChild(int $id): AbstractNode
{
    $current = $this->getChild($id);
    $precedingId = $this->children[$current->id()]['prev'];

    // null (or anything else that is not an int) means there is no preceding sibling
    if (!\is_int($precedingId)) {
        throw new ChildNotFoundException("Child '$id' previous not found in this node.");
    }

    return $this->getChild($precedingId);
}
/**
 * Checks if the given node id is a child of the
 * current node.
 *
 * The children array is keyed by child id, so this is a direct key lookup
 * (the same test getChild() uses) rather than a scan over all keys.
 */
public function isChild(int $id): bool
{
    return isset($this->children[$id]);
}
/**
* Removes the child with id $childId and replace it with the new child
* $newChild.
*
* The new child takes over the array position and the prev/next links of the old one,
* and the neighbouring entries are re-pointed at the new id.
*
* NOTE(review): there is no check that $childId exists in the children array, and
* setParent() is not called on $newChild here; confirm callers take care of both.
*
* @throws LogicalException
*/
public function replaceChild(int $childId, AbstractNode $newChild): void
{
$oldChild = $this->children[$childId];
// the int casts turn a missing (null) link into 0 on the new child
$newChild->prev = (int) $oldChild['prev'];
$newChild->next = (int) $oldChild['next'];
// rename the old child's key to the new child's id while keeping the array order
$keys = \array_keys($this->children);
$index = \array_search($childId, $keys, true);
$keys[$index] = $newChild->id();
$combination = \array_combine($keys, $this->children);
if ($combination === false) {
// The number of elements for each array isn't equal or if the arrays are empty.
throw new LogicalException('array combine failed during replace child method call.');
}
$this->children = $combination;
// overwrite the renamed entry so that it carries the new node with the old links
$this->children[$newChild->id()] = [
'prev' => $oldChild['prev'],
'node' => $newChild,
'next' => $oldChild['next'],
];
// change previous child id to new child
if ($oldChild['prev'] && isset($this->children[$newChild->prev])) {
$this->children[$oldChild['prev']]['next'] = $newChild->id();
}
// change next child id to new child
if ($oldChild['next'] && isset($this->children[$newChild->next])) {
$this->children[$oldChild['next']]['prev'] = $newChild->id();
}
// remove old child
unset($this->children[$childId]);
// clean out cache
$this->clear();
}
/**
 * Shortcut to return the first child.
 *
 * "First" is the first entry of the children array.
 *
 * @throws ChildNotFoundException when this node has no children
 *
 * @uses $this->getChild()
 */
public function firstChild(): AbstractNode
{
    if (0 == \count($this->children)) {
        // no children
        throw new ChildNotFoundException('No children found in node.');
    }

    // move the internal array pointer to the first entry and read its key
    \reset($this->children);

    return $this->getChild((int) \key($this->children));
}
/**
 * Attempts to get the last child.
 *
 * "Last" is the last entry of the children array.
 *
 * @throws ChildNotFoundException when this node has no children
 * @throws LogicalException       when the last key of the children array is not an int
 *
 * @uses $this->getChild()
 */
public function lastChild(): AbstractNode
{
    if (0 == \count($this->children)) {
        // no children
        throw new ChildNotFoundException('No children found in node.');
    }

    // move the internal array pointer to the last entry and read its key
    \end($this->children);
    $lastKey = \key($this->children);
    if (\is_int($lastKey)) {
        return $this->getChild($lastKey);
    }

    throw new LogicalException('Children array contain child with a key that is not an int.');
}
/**
 * Checks if the given node id is a descendant of the
 * current node.
 *
 * Direct children are checked first; after that every child that is itself an
 * InnerNode with children is searched recursively.
 */
public function isDescendant(int $id): bool
{
    if ($this->isChild($id)) {
        return true;
    }

    foreach ($this->children as $entry) {
        /** @var InnerNode $candidate */
        $candidate = $entry['node'];
        if (!$candidate instanceof InnerNode) {
            // nodes that cannot hold children cannot contain the id
            continue;
        }
        if ($candidate->hasChildren() && $candidate->isDescendant($id)) {
            return true;
        }
    }

    return false;
}
/**
* Sets the parent node.
*
* Refuses a parent that is currently a descendant of this node, clears this node's
* cache, and then defers to the parent class implementation.
*
* @throws ChildNotFoundException
* @throws CircularException when $parent is a descendant of this node
*/
public function setParent(InnerNode $parent): AbstractNode
{
// check integrity
if ($this->isDescendant($parent->id())) {
throw new CircularException('Can not add descendant "' . $parent->id() . '" as my parent.');
}
// clear cache
$this->clear();
return parent::setParent($parent);
}
}
@@ -0,0 +1,21 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Dom\Node;
use PHPHtmlParser\Dom\Tag;
/**
* Class LeafNode.
*
* Base class for nodes that extend AbstractNode directly instead of InnerNode; it
* adds no members of its own.
*
* @property-read string $outerhtml
* @property-read string $innerhtml
* @property-read string $innerText
* @property-read string $text
* @property-read Tag $tag
* @property-read InnerNode $parent
*/
abstract class LeafNode extends AbstractNode
{
}
@@ -0,0 +1,155 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Dom\Node;
use PHPHtmlParser\Dom\Tag;
use PHPHtmlParser\Exceptions\LogicalException;
/**
* Class TextNode.
*
* @property-read string $outerhtml
* @property-read string $innerhtml
* @property-read string $innerText
* @property-read string $text
* @property-read Tag $tag
* @property-read InnerNode $parent
*/
class TextNode extends LeafNode
{
/**
* This is a text node.
*
* @var Tag
*/
protected $tag;
/**
* This is the text in this node.
*
* @var string
*/
protected $text;
/**
* This is the converted version of the text.
*
* @var ?string
*/
protected $convertedText;
/**
 * Sets the text for this node.
 *
 * Optionally collapses every run of whitespace into a single space, then turns
 * the entity '&#10;' back into a newline. The node is given a Tag named 'text'.
 *
 * @param bool $removeDoubleSpace
 *
 * @throws LogicalException when mb_ereg_replace reports failure
 */
public function __construct(string $text, $removeDoubleSpace = true)
{
    if ($removeDoubleSpace) {
        // remove double spaces
        $collapsed = \mb_ereg_replace('\s+', ' ', $text);
        if (false === $collapsed) {
            throw new LogicalException('mb_ereg_replace returns false when attempting to clean white space from "' . $text . '".');
        }
        $text = $collapsed;
    }

    // restore line breaks
    $this->text = \str_replace('&#10;', "\n", $text);
    $this->tag = new Tag('text');
    parent::__construct();
}
/**
* Applies the html-special-chars-decode flag to this node and to its tag.
*
* The parent implementation is called first, then the same value is pushed to the tag.
*
* @param bool $htmlSpecialCharsDecode
*/
public function setHtmlSpecialCharsDecode($htmlSpecialCharsDecode = false): void
{
parent::setHtmlSpecialCharsDecode($htmlSpecialCharsDecode);
$this->tag->setHtmlSpecialCharsDecode($htmlSpecialCharsDecode);
}
/**
 * Returns the text of this node.
 *
 * The stored text is passed through htmlspecialchars_decode() when that flag is
 * set. When an encoder is present the text is additionally converted by it, and
 * that converted value is cached for later calls.
 */
public function text(): string
{
    $plain = $this->htmlSpecialCharsDecode
        ? \htmlspecialchars_decode($this->text)
        : $this->text;

    if ($this->encode === null) {
        // no charset conversion configured
        return $plain;
    }

    if ($this->convertedText !== null) {
        // we already know the converted value
        return $this->convertedText;
    }

    // convert charset and remember the conversion
    $converted = $this->encode->convert($plain);
    $this->convertedText = $converted;

    return $converted;
}
/**
 * Sets the text for this node.
 *
 * The cached converted text is dropped instead of being pre-computed here, so the
 * next text() call rebuilds it through the same path as always (including the
 * optional htmlspecialchars_decode step). The parent's cached output is
 * invalidated as well, because it embeds this node's text.
 *
 * @param string $text the new raw text
 */
public function setText(string $text): void
{
    $this->text = $text;

    // forget the conversion of the old text; text() re-creates it lazily
    $this->clear();

    // the parent chain may hold html/text built from the old value
    if ($this->parent !== null) {
        $this->parent->clear();
    }
}
/**
* This node has no html, just return the text.
*
* Identical to calling text().
*
* @uses $this->text()
*/
public function innerHtml(): string
{
return $this->text();
}
/**
* This node has no html, just return the text.
*
* Identical to calling text(); no tag markup is added around it.
*
* @uses $this->text()
*/
public function outerHtml(): string
{
return $this->text();
}
/**
* Checks if the current node is a text node.
*
* Always true for this class.
*/
public function isTextNode(): bool
{
return true;
}
/**
* Call this when something in the node tree has changed. Like a child has been added
* or a parent has been changed.
*
* For a text node the only cached value is the charset-converted text, which is
* dropped here so that text() converts again on its next call.
*/
protected function clear(): void
{
$this->convertedText = null;
}
}
@@ -0,0 +1,348 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Dom;
use PHPHtmlParser\Content;
use PHPHtmlParser\Contracts\Dom\ParserInterface;
use PHPHtmlParser\Dom\Node\AbstractNode;
use PHPHtmlParser\Dom\Node\HtmlNode;
use PHPHtmlParser\Dom\Node\TextNode;
use PHPHtmlParser\DTO\TagDTO;
use PHPHtmlParser\Enum\StringToken;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
use PHPHtmlParser\Exceptions\CircularException;
use PHPHtmlParser\Exceptions\ContentLengthException;
use PHPHtmlParser\Exceptions\LogicalException;
use PHPHtmlParser\Exceptions\StrictException;
use PHPHtmlParser\Options;
use stringEncode\Encode;
class Parser implements ParserInterface
{
/**
* Attempts to parse the html in content.
*
* Builds a tree below a synthetic HtmlNode named 'root'. The loop alternates between
* copying text up to the next tag and parsing that tag, while $activeNode tracks the
* element that newly found children are attached to. The loop ends when parseTag()
* reports that no tag could be read, or when the active node's parent is null.
*
* @param int $size passed through to parseTag()
*
* @throws ChildNotFoundException
* @throws CircularException
* @throws ContentLengthException
* @throws LogicalException
* @throws StrictException
*
* @return AbstractNode the 'root' node
*/
public function parse(Options $options, Content $content, int $size): AbstractNode
{
// add the root node
$root = new HtmlNode('root');
$root->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
$activeNode = $root;
while ($activeNode !== null) {
// inside a script element (with input cleanup off) text is copied up to the next '</'
// rather than the next '<', so a '<' inside the script body stays part of the text
if ($activeNode && $activeNode->tag->name() === 'script'
&& $options->isCleanupInput() !== true
) {
$str = $content->copyUntil('</');
} else {
$str = $content->copyUntil('<');
}
if ($str == '') {
// nothing was copied, so try to read a tag at the current position
$tagDTO = $this->parseTag($options, $content, $size);
if (!$tagDTO->isStatus()) {
// we are done here
$activeNode = null;
continue;
}
// check if it was a closing tag
if ($tagDTO->isClosing()) {
$foundOpeningTag = true;
$originalNode = $activeNode;
// climb towards the root until an open element with the same tag name is found
while ($activeNode->getTag()->name() != $tagDTO->getTag()) {
$activeNode = $activeNode->getParent();
if ($activeNode === null) {
// we could not find opening tag
$activeNode = $originalNode;
$foundOpeningTag = false;
break;
}
}
if ($foundOpeningTag) {
// the matched element is now closed; continue inside its parent
$activeNode = $activeNode->getParent();
}
continue;
}
// a successfully read tag may carry no node (makeEndTag() does this for closing tags
// whose name is in the self-closing list); there is nothing to attach then
if ($tagDTO->getNode() === null) {
continue;
}
/** @var AbstractNode $node */
$node = $tagDTO->getNode();
$activeNode->addChild($node);
// check if node is self closing
if (!$node->getTag()->isSelfClosing()) {
// an element that stays open receives the content that follows
$activeNode = $node;
}
} elseif ($options->isWhitespaceTextNode() ||
\trim($str) != ''
) {
// we found text we care about
$textNode = new TextNode($str, $options->isRemoveDoubleSpace());
$textNode->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
$activeNode->addChild($textNode);
}
}
return $root;
}
/**
 * Attempts to detect the charset that the html was sent in.
 *
 * An encoder is prepared with the default charset as both source and target.
 * When an encoding is enforced through the options, the method stops there and
 * returns false without touching the tree. Otherwise the charset is looked up
 * in `<meta http-equiv=Content-Type>` (falling back to the HTML5 `<meta charset>`
 * form) and the encoder is propagated through the tree.
 *
 * @throws ChildNotFoundException
 *
 * @return bool true only when a charset declaration was found in the document
 */
public function detectCharset(Options $options, string $defaultCharset, AbstractNode $root): bool
{
    // set the default
    $encode = new Encode();
    $encode->from($defaultCharset);
    $encode->to($defaultCharset);

    $enforced = $options->getEnforceEncoding();
    if ($enforced !== null) {
        //  they want to enforce the given encoding
        $encode->from($enforced);
        $encode->to($enforced);

        return false;
    }

    /** @var AbstractNode $meta */
    $meta = $root->find('meta[http-equiv=Content-Type]', 0);
    if ($meta == null) {
        if ($this->detectHTML5Charset($encode, $root)) {
            return true;
        }

        // could not find meta tag
        $root->propagateEncoding($encode);

        return false;
    }

    // a charset counts as found only when the content attribute exists and names one
    $content = $meta->getAttribute('content');
    $matches = [];
    $found = !\is_null($content) && \preg_match('/charset=([^;]+)/', $content, $matches);
    if ($found) {
        $encode->from(\trim($matches[1]));
    }
    $root->propagateEncoding($encode);

    return $found;
}
/**
* Attempt to parse a tag out of the content.
*
* Expects the cursor to be on a '<'. Closing tags are delegated to makeEndTag();
* '<?' and '<!--' constructs become self-closing tags with custom opening and closing
* strings; everything else is read as a lower-cased element name followed by its
* attributes.
*
* NOTE(review): the arguments of TagDTO::makeFromPrimitives() appear to be
* (status, closing, node, tag name), judging from how parse() reads the DTO; confirm
* against TagDTO.
*
* @throws ContentLengthException
* @throws LogicalException
* @throws StrictException
*/
private function parseTag(Options $options, Content $content, int $size): TagDTO
{
if ($content->char() != '<') {
// we are not at the beginning of a tag
return TagDTO::makeFromPrimitives();
}
// check if this is a closing tag
try {
$content->fastForward(1);
} catch (ContentLengthException $exception) {
// we are at the end of the file
return TagDTO::makeFromPrimitives();
}
if ($content->char() == '/') {
return $this->makeEndTag($content, $options);
}
if ($content->char() == '?') {
// special setting tag
$tag = $content->fastForward(1)
->copyByToken(StringToken::SLASH(), true);
$tag = (new Tag($tag))
->setOpening('<?')
->setClosing(' ?>')
->selfClosing();
} elseif($content->string(3) == '!--') {
// comment tag
$tag = $content->fastForward(3)
->copyByToken(StringToken::CLOSECOMMENT(), true);
$tag = (new Tag($tag))
->setOpening('<!--')
->setClosing('-->')
->selfClosing();
} else {
$tag = \strtolower($content->copyByToken(StringToken::SLASH(), true));
if (\trim($tag) == '') {
// no tag found, invalid < found
return TagDTO::makeFromPrimitives();
}
}
// $tag is a Tag object for the two special forms above and a plain name string otherwise
$node = new HtmlNode($tag);
$node->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
$this->setUpAttributes($content, $size, $node, $options, $tag);
$content->skipByToken(StringToken::BLANK());
if ($content->char() == '/') {
// self closing tag
$node->getTag()->selfClosing();
$content->fastForward(1);
} elseif (\in_array($node->getTag()->name(), $options->getSelfClosing(), true)) {
// Should be a self closing tag, check if we are strict
if ($options->isStrict()) {
$character = $content->getPosition();
throw new StrictException("Tag '" . $node->getTag()->name() . "' is not self closing! (character #$character)");
}
// We force self closing on this tag.
$node->getTag()->selfClosing();
// Should this tag use a trailing slash?
if (\in_array($node->getTag()->name(), $options->getNoSlash(), true)) {
$node->getTag()->noTrailingSlash();
}
}
// step past the character that ended the tag, unless the content ends here
if ($content->canFastForward(1)) {
$content->fastForward(1);
}
return TagDTO::makeFromPrimitives(true, false, $node);
}
/**
 * Looks for the HTML5 form of the charset declaration (`<meta charset="...">`).
 *
 * When a usable charset is found it becomes the encoder's source charset and the
 * encoder is propagated through the tree. In every other case nothing is changed
 * and false is returned, which leaves the fallback handling to the caller.
 *
 * @throws ChildNotFoundException
 *
 * @return bool true when a charset was found and applied
 */
private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool
{
    /** @var AbstractNode|null $meta */
    $meta = $root->find('meta[charset]', 0);
    if ($meta == null) {
        return false;
    }

    // getAttribute() is nullable (the parser stores null for an attribute written
    // without a value, e.g. `<meta charset>`), and trim(null) is a TypeError under
    // strict_types. A missing or blank value cannot name a charset either way.
    $charset = $meta->getAttribute('charset');
    if ($charset === null) {
        return false;
    }
    $charset = \trim($charset);
    if ($charset === '') {
        return false;
    }

    $encode->from($charset);
    $root->propagateEncoding($encode);

    return true;
}
/**
 * Reads a closing tag and moves the cursor past its '>'.
 *
 * The tag name is lower-cased. A closing tag whose name is in the configured
 * self-closing list yields a DTO without a tag name or node; any other closing
 * tag yields a DTO flagged as closing that carries the name.
 *
 * @throws ContentLengthException
 * @throws LogicalException
 */
private function makeEndTag(Content $content, Options $options): TagDTO
{
    $rawName = $content->fastForward(1)
        ->copyByToken(StringToken::SLASH(), true);

    // move to end of tag
    $content->copyUntil('>');
    $content->fastForward(1);

    // check if this closing tag counts
    $name = \strtolower($rawName);
    $isSelfClosingName = \in_array($name, $options->getSelfClosing(), true);

    return $isSelfClosingName
        ? TagDTO::makeFromPrimitives(true)
        : TagDTO::makeFromPrimitives(true, true, null, $name);
}
/**
* Reads the attributes of the tag at the cursor and stores them on the node's tag.
*
* The loop runs until the cursor is on '>' or '/'. Values may be double-quoted,
* single-quoted or unquoted; an attribute without a value is stored as null, or
* rejected with a StrictException when the strict option is on.
*
* @param int        $size bound for the cursor position while a quoted value is extended
* @param string|Tag $tag  only used in the StrictException message
*
* @throws ContentLengthException
* @throws LogicalException
* @throws StrictException
*/
private function setUpAttributes(Content $content, int $size, HtmlNode $node, Options $options, $tag): void
{
while (
$content->char() != '>' &&
$content->char() != '/'
) {
$space = $content->skipByToken(StringToken::BLANK(), true);
if (empty($space)) {
// no blank was skipped at this position: advance one character and test again
try {
$content->fastForward(1);
} catch (ContentLengthException $exception) {
// reached the end of the content
break;
}
continue;
}
$name = $content->copyByToken(StringToken::EQUAL(), true);
if ($name == '/') {
break;
}
if (empty($name)) {
$content->skipByToken(StringToken::BLANK());
continue;
}
$content->skipByToken(StringToken::BLANK());
if ($content->char() == '=') {
$content->fastForward(1)
->skipByToken(StringToken::BLANK());
switch ($content->char()) {
case '"':
// double-quoted value
$content->fastForward(1);
$string = $content->copyUntil('"', true);
// keep appending for as long as copyUntilUnless() returns more text
do {
$moreString = $content->copyUntilUnless('"', '=>');
$string .= $moreString;
} while (\strlen($moreString) > 0 && $content->getPosition() < $size);
$content->fastForward(1);
$node->getTag()->setAttribute($name, $string);
break;
case "'":
// single-quoted value: same procedure, stored with the double-quote flag off
$content->fastForward(1);
$string = $content->copyUntil("'", true);
do {
$moreString = $content->copyUntilUnless("'", '=>');
$string .= $moreString;
} while (\strlen($moreString) > 0 && $content->getPosition() < $size);
$content->fastForward(1);
$node->getTag()->setAttribute($name, $string, false);
break;
default:
// unquoted value
$node->getTag()->setAttribute($name, $content->copyByToken(StringToken::ATTR(), true));
break;
}
} else {
// no value attribute
if ($options->isStrict()) {
// can't have this in strict html
$character = $content->getPosition();
throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
}
$node->getTag()->setAttribute($name, null);
// step back one character unless the cursor already sits on the closing '>'
if ($content->char() != '>') {
$content->rewind(1);
}
}
}
}
}
@@ -0,0 +1,100 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Dom;
use PHPHtmlParser\Dom\Node\AbstractNode;
use PHPHtmlParser\Dom\Node\HtmlNode;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
use PHPHtmlParser\Exceptions\NotLoadedException;
/**
* Convenience accessors that forward to the root node of a dom tree.
*
* Every accessor first calls isLoaded(), which the using class has to provide,
* and then delegates to the node stored in $root.
*/
trait RootAccessTrait
{
/**
* Contains the root node of this dom tree.
*
* @var HtmlNode
*/
public $root;
/**
* A simple wrapper around the root node: reads the property $name from it.
*
* @param string $name
*
* @throws NotLoadedException
*
* @return mixed
*/
public function __get($name)
{
$this->isLoaded();
return $this->root->$name;
}
/**
* Simple wrapper function that returns the first child of the root node.
*
* @throws ChildNotFoundException
* @throws NotLoadedException
*/
public function firstChild(): AbstractNode
{
$this->isLoaded();
return $this->root->firstChild();
}
/**
* Simple wrapper function that returns the last child of the root node.
*
* @throws ChildNotFoundException
* @throws NotLoadedException
*/
public function lastChild(): AbstractNode
{
$this->isLoaded();
return $this->root->lastChild();
}
/**
* Simple wrapper function that returns the number of children of the root node.
*
* @throws NotLoadedException
*/
public function countChildren(): int
{
$this->isLoaded();
return $this->root->countChildren();
}
/**
* Get array of the root node's children.
*
* @throws NotLoadedException
*/
public function getChildren(): array
{
$this->isLoaded();
return $this->root->getChildren();
}
/**
* Check if the root node has children nodes.
*
* @throws NotLoadedException
*/
public function hasChildren(): bool
{
$this->isLoaded();
return $this->root->hasChildren();
}
/**
* Guard called at the start of every accessor in this trait.
*
* NOTE(review): presumably throws NotLoadedException when no document has been
* loaded yet (the accessors document that exception); confirm in the using class.
*/
abstract public function isLoaded(): void;
}
@@ -0,0 +1,365 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Dom;
use PHPHtmlParser\DTO\Tag\AttributeDTO;
use PHPHtmlParser\Exceptions\Tag\AttributeNotFoundException;
use stringEncode\Encode;
/**
 * Class Tag.
 *
 * Models a single HTML tag: its name, its attributes and the flags that
 * control how the opening and closing markup is rendered.
 *
 * The fluent mutators (selfClosing(), setOpening(), setClosing(),
 * noTrailingSlash(), noise() and setAttribute()) all modify THIS instance and
 * then return a clone of it. Chaining therefore applies every call after the
 * first one to a copy; call them as separate statements when the original
 * instance must receive all changes.
 */
class Tag
{
    /**
     * The name of the tag.
     *
     * @var string
     */
    protected $name;

    /**
     * The attributes of the tag, keyed by lower-cased attribute name.
     *
     * @var AttributeDTO[]
     */
    protected $attr = [];

    /**
     * Is this tag self closing.
     *
     * @var bool
     */
    protected $selfClosing = false;

    /**
     * If self-closing, will this use a trailing slash. />.
     *
     * @var bool
     */
    protected $trailingSlash = true;

    /**
     * Tag noise. Only stored by noise(); nothing in this class reads it back.
     *
     * @var string
     */
    protected $noise = '';

    /**
     * The encoding class to... encode the tags.
     *
     * @var Encode|null
     */
    protected $encode;

    /**
     * When true, setAttribute() calls htmlspecialcharsDecode() on every
     * attribute it stores.
     *
     * @var bool
     */
    private $HtmlSpecialCharsDecode = false;

    /**
     * What the opening of this tag will be.
     *
     * @var string
     */
    private $opening = '<';

    /**
     * What the closing tag for self-closing elements should be.
     *
     * @var string
     */
    private $closing = ' />';

    /**
     * Sets up the tag with a name.
     *
     * @param string $name
     */
    public function __construct(string $name)
    {
        $this->name = $name;
    }

    /**
     * Returns the name of this tag.
     */
    public function name(): string
    {
        return $this->name;
    }

    /**
     * Sets the tag to be self closing.
     *
     * @return Tag a clone taken after this instance was modified
     */
    public function selfClosing(): Tag
    {
        $this->selfClosing = true;

        return clone $this;
    }

    /**
     * Sets the string emitted before the tag name (default '<').
     *
     * @return Tag a clone taken after this instance was modified
     */
    public function setOpening(string $opening): Tag
    {
        $this->opening = $opening;

        return clone $this;
    }

    /**
     * Sets the string that terminates a self-closing tag (default ' />').
     *
     * @return Tag a clone taken after this instance was modified
     */
    public function setClosing(string $closing): Tag
    {
        $this->closing = $closing;

        return clone $this;
    }

    /**
     * Sets the tag to not use a trailing slash.
     *
     * @return Tag a clone taken after this instance was modified
     */
    public function noTrailingSlash(): Tag
    {
        $this->trailingSlash = false;

        return clone $this;
    }

    /**
     * Checks if the tag is self closing.
     */
    public function isSelfClosing(): bool
    {
        return $this->selfClosing;
    }

    /**
     * Sets the encoding type to be used.
     */
    public function setEncoding(Encode $encode): void
    {
        $this->encode = $encode;
    }

    /**
     * Enables or disables htmlspecialchars decoding for attributes stored
     * by subsequent setAttribute() calls.
     *
     * @param bool $htmlSpecialCharsDecode
     */
    public function setHtmlSpecialCharsDecode($htmlSpecialCharsDecode = false): void
    {
        $this->HtmlSpecialCharsDecode = $htmlSpecialCharsDecode;
    }

    /**
     * Sets the noise for this tag (if any).
     *
     * @return Tag a clone taken after this instance was modified
     */
    public function noise(string $noise): Tag
    {
        $this->noise = $noise;

        return clone $this;
    }

    /**
     * Set an attribute for this tag. The key is stored lower-cased.
     *
     * @param string      $key            attribute name
     * @param string|null $attributeValue attribute value, null for a value-less attribute
     * @param bool        $doubleQuote    render with double (true) or single (false) quotes
     *
     * @return Tag a clone taken after this instance was modified
     */
    public function setAttribute(string $key, ?string $attributeValue, bool $doubleQuote = true): Tag
    {
        $attributeDTO = AttributeDTO::makeFromPrimitives(
            $attributeValue,
            $doubleQuote
        );
        if ($this->HtmlSpecialCharsDecode) {
            $attributeDTO->htmlspecialcharsDecode();
        }
        $this->attr[\strtolower($key)] = $attributeDTO;

        return clone $this;
    }

    /**
     * Set inline style attribute value.
     *
     * Reads the current style declarations, sets or overwrites one property
     * and writes the whole style attribute back as "key:value;" pairs.
     *
     * @param mixed $attr_key
     * @param mixed $attr_value
     */
    public function setStyleAttributeValue($attr_key, $attr_value): void
    {
        $style_array = $this->getStyleAttributeArray();
        $style_array[$attr_key] = $attr_value;

        $style_string = '';
        foreach ($style_array as $key => $value) {
            $style_string .= $key . ':' . $value . ';';
        }

        $this->setAttribute('style', $style_string);
    }

    /**
     * Get style attribute in array.
     *
     * Splits the style attribute into a property => value map. Works with or
     * without a trailing ';', keeps ':' characters that belong to the value
     * (e.g. url(http://...)), trims surrounding whitespace and skips segments
     * that are empty or have no ':' separator.
     *
     * @throws \stringEncode\Exception
     *
     * @return array<string, string> empty when there is no style attribute or it has no value
     */
    public function getStyleAttributeArray(): array
    {
        try {
            $value = $this->getAttribute('style')->getValue();
        } catch (AttributeNotFoundException $e) {
            return [];
        }

        if (\is_null($value)) {
            return [];
        }

        $result = [];
        foreach (\explode(';', $value) as $declaration) {
            // limit 2: only the first ':' separates property from value
            $parts = \explode(':', $declaration, 2);
            if (\count($parts) !== 2) {
                continue;
            }
            $property = \trim($parts[0]);
            if ($property === '') {
                continue;
            }
            $result[$property] = \trim($parts[1]);
        }

        return $result;
    }

    /**
     * Removes an attribute from this tag.
     *
     * @param mixed $key
     *
     * @return void
     */
    public function removeAttribute($key)
    {
        $key = \strtolower($key);
        unset($this->attr[$key]);
    }

    /**
     * Removes all attributes on this tag.
     *
     * @return void
     */
    public function removeAllAttributes()
    {
        $this->attr = [];
    }

    /**
     * Sets the attributes for this tag.
     *
     * Each entry is either a plain value (stored double-quoted) or an array
     * with a 'value' entry and an optional 'doubleQuote' entry.
     *
     * @return $this
     */
    public function setAttributes(array $attr)
    {
        foreach ($attr as $key => $info) {
            // PHP turns numeric-string array keys into ints; setAttribute()
            // requires a string under strict_types.
            $key = (string) $key;
            if (\is_array($info)) {
                $this->setAttribute($key, $info['value'] ?? null, $info['doubleQuote'] ?? true);
            } else {
                $this->setAttribute($key, $info);
            }
        }

        return $this;
    }

    /**
     * Returns all attributes of this tag.
     *
     * @throws \stringEncode\Exception
     *
     * @return AttributeDTO[]
     */
    public function getAttributes(): array
    {
        $return = [];
        foreach (\array_keys($this->attr) as $attr) {
            try {
                // cast: numeric attribute names come back from array_keys() as ints
                $return[$attr] = $this->getAttribute((string) $attr);
            } catch (AttributeNotFoundException $e) {
                // attribute that was in the array was not found in the array....
                unset($e);
            }
        }

        return $return;
    }

    /**
     * Returns an attribute by the key (case-insensitive).
     *
     * When an encoder has been set, the stored DTO's value is converted
     * before it is returned.
     *
     * @throws AttributeNotFoundException
     * @throws \stringEncode\Exception
     */
    public function getAttribute(string $key): AttributeDTO
    {
        $key = \strtolower($key);
        if (!isset($this->attr[$key])) {
            throw new AttributeNotFoundException('Attribute with key "' . $key . '" not found.');
        }
        $attributeDTO = $this->attr[$key];
        if (!\is_null($this->encode)) {
            // convert charset
            $attributeDTO->encodeValue($this->encode);
        }

        return $attributeDTO;
    }

    /**
     * Returns TRUE if node has attribute.
     *
     * The lookup is case-insensitive, matching setAttribute(), getAttribute()
     * and removeAttribute(), which all lower-case the key.
     *
     * @return bool
     */
    public function hasAttribute(string $key)
    {
        return isset($this->attr[\strtolower($key)]);
    }

    /**
     * Generates the opening tag for this object.
     *
     * Attributes without a value are rendered as a bare name; the others use
     * the quote style recorded on their DTO. Values are emitted as stored,
     * without any escaping.
     *
     * @throws \stringEncode\Exception
     *
     * @return string
     */
    public function makeOpeningTag()
    {
        $return = $this->opening . $this->name;

        // add the attributes
        foreach (\array_keys($this->attr) as $key) {
            // numeric attribute names come back as ints; getAttribute() needs a string
            $key = (string) $key;
            try {
                $attributeDTO = $this->getAttribute($key);
            } catch (AttributeNotFoundException $e) {
                // attribute that was in the array not found in the array... let's continue.
                continue;
            } catch (\TypeError $e) {
                // no usable DTO for this key: render it as a value-less attribute
                $return .= ' ' . $key;
                continue;
            }

            $val = $attributeDTO->getValue();
            if (\is_null($val)) {
                $return .= ' ' . $key;
            } elseif ($attributeDTO->isDoubleQuote()) {
                $return .= ' ' . $key . '="' . $val . '"';
            } else {
                $return .= ' ' . $key . '=\'' . $val . '\'';
            }
        }

        if ($this->selfClosing && $this->trailingSlash) {
            return $return . $this->closing;
        }

        return $return . '>';
    }

    /**
     * Generates the closing tag for this object.
     *
     * @return string empty string for self-closing tags
     */
    public function makeClosingTag()
    {
        if ($this->selfClosing) {
            return '';
        }

        return '</' . $this->name . '>';
    }
}
@@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Enum;
use MyCLabs\Enum\Enum;
/**
 * Named character sets handed to the content scanner.
 *
 * Each case is reached through the magic static factory declared below
 * (e.g. StringToken::ATTR()), provided by the MyCLabs Enum base class.
 * NOTE(review): how the scanner interprets each value (set of single
 * characters vs. literal sequence) is decided by the consumer, which is not
 * part of this file — confirm there before changing a value.
 *
 * @method static StringToken BLANK()
 * @method static StringToken EQUAL()
 * @method static StringToken SLASH()
 * @method static StringToken ATTR()
 * @method static StringToken CLOSECOMMENT()
 */
class StringToken extends Enum
{
    // space, tab, carriage return, line feed
    private const BLANK = " \t\r\n";
    // space, '=', '/', '>'
    private const EQUAL = ' =/>';
    // space, '/', '>', carriage return, line feed, tab
    private const SLASH = " />\r\n\t";
    // space, '>' — passed to copyByToken() when reading an unquoted attribute value
    private const ATTR = ' >';
    // the characters of an HTML comment terminator
    private const CLOSECOMMENT = '-->';
}
@@ -0,0 +1,14 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Exceptions;
use Exception;
/**
 * Class ChildNotFoundException.
 *
 * Signals that a requested child node does not exist. The selector Seeker
 * catches it while iterating children and treats it as "no more children".
 */
final class ChildNotFoundException extends Exception
{
}
@@ -0,0 +1,14 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Exceptions;
use Exception;
/**
* Class CircularException.
*/
final class CircularException extends Exception
{
}
@@ -0,0 +1,14 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Exceptions;
use Exception;
/**
 * Class ContentLengthException.
 */
final class ContentLengthException extends Exception
{
}
@@ -0,0 +1,14 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Exceptions;
use Exception;
/**
* Class CurlException.
*/
class CurlException extends Exception
{
}
@@ -0,0 +1,14 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Exceptions;
use Exception;
/**
* Class EmptyCollectionException.
*/
final class EmptyCollectionException extends Exception
{
}
@@ -0,0 +1,14 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Exceptions;
use Exception;
/**
 * Class LogicalException.
 */
final class LogicalException extends Exception
{
}
@@ -0,0 +1,14 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Exceptions;
use Exception;
/**
 * Class NotLoadedException.
 *
 * Thrown when a DOM method is requested before any document has been
 * loaded, e.g. by StaticDom::__callStatic() when no Dom instance is mounted.
 */
final class NotLoadedException extends Exception
{
}
@@ -0,0 +1,14 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Exceptions;
use Exception;
/**
* Class ParentNotFoundException.
*/
final class ParentNotFoundException extends Exception
{
}
@@ -0,0 +1,14 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Exceptions;
use Exception;
/**
 * Class StrictException.
 *
 * Thrown by the parser when the strict option is enabled and the markup is
 * not strictly compliant, e.g. an attribute that has no value.
 */
final class StrictException extends Exception
{
}
@@ -0,0 +1,12 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Exceptions\Tag;
/**
 * Class AttributeNotFoundException.
 *
 * Thrown by Tag::getAttribute() when no attribute is stored under the
 * requested (lower-cased) key.
 */
class AttributeNotFoundException extends \Exception
{
}
@@ -0,0 +1,14 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Exceptions;
use Exception;
/**
* Class UnknownChildTypeException.
*/
final class UnknownChildTypeException extends Exception
{
}
@@ -0,0 +1,14 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Exceptions;
use Exception;
/**
* Class UnknownOptionException.
*/
final class UnknownOptionException extends Exception
{
}
@@ -0,0 +1,64 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser;
use PHPHtmlParser\Dom\Node\AbstractNode;
use PHPHtmlParser\Dom\Node\InnerNode;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
use PHPHtmlParser\Exceptions\ParentNotFoundException;
/**
 * Locates a node inside a tree by its numeric id.
 */
class Finder
{
    /**
     * Id of the node being searched for.
     *
     * @var int
     */
    private $id;

    /**
     * Finder constructor.
     *
     * @param $id id of the node to look for (stored as-is; the parameter is untyped)
     */
    public function __construct($id)
    {
        $this->id = $id;
    }

    /**
     * Find node in tree by id.
     *
     * Walks the tree recursively starting at $node, comparing ids with loose
     * equality. The search is steered by comparing the next sibling's id with
     * the target id.
     * NOTE(review): this presumably relies on ids increasing in document
     * order — confirm against the code that assigns node ids.
     *
     * @throws ChildNotFoundException
     * @throws ParentNotFoundException
     *
     * @return bool|AbstractNode the matching node, or false when the walk ends without a match
     */
    public function find(AbstractNode $node)
    {
        // a node with a falsy id that can hold children: continue with its first child
        if (!$node->id() && $node instanceof InnerNode) {
            return $this->find($node->firstChild());
        }
        if ($node->id() == $this->id) {
            return $node;
        }
        if ($node->hasNextSibling()) {
            $nextSibling = $node->nextSibling();
            if ($nextSibling->id() == $this->id) {
                return $nextSibling;
            }
            // the sibling's id is already past the target: look below the current node
            if ($nextSibling->id() > $this->id && $node instanceof InnerNode) {
                return $this->find($node->firstChild());
            }
            // the sibling's id is still before the target: continue from the sibling
            if ($nextSibling->id() < $this->id) {
                return $this->find($nextSibling);
            }
        } elseif (!$node->isTextNode() && $node instanceof InnerNode) {
            // last sibling on this level: the only place left to look is below it
            return $this->find($node->firstChild());
        }

        return false;
    }
}
@@ -0,0 +1,367 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser;
/**
 * Parser options.
 *
 * Every setter / add / remove / clear method modifies THIS instance and then
 * returns a clone of it. When such calls are chained, only the first call
 * touches the original object; the following calls operate on successive
 * clones, so callers must keep the value returned by the end of the chain.
 */
class Options
{
    /**
     * The whitespaceTextNode, by default true, option tells the parser to save textnodes even if the content of the
     * node is empty (only whitespace). Setting it to false will ignore all whitespace only text node found in the document.
     *
     * @var bool
     */
    private $whitespaceTextNode = true;

    /**
     * Strict, by default false, will throw a StrictException if it finds that the html is not strictly compliant
     * (all tags must have a closing tag, no attribute without a value, etc.).
     *
     * @var bool
     */
    private $strict = false;

    /**
     * The enforceEncoding, by default null, option will enforce a character set to be used for reading the content
     * and returning the content in that encoding. Setting it to null will trigger an attempt to figure out
     * the encoding from within the content of the string given instead.
     *
     * @var ?string
     */
    private $enforceEncoding;

    /**
     * Set this to false to skip the entire clean up phase of the parser. Defaults to true.
     *
     * @var bool
     */
    private $cleanupInput = true;

    /**
     * Set this to false to skip removing the script tags from the document body. This might have adverse effects.
     * Defaults to true.
     *
     * NOTE(review): presumably only consulted while cleanupInput is enabled — confirm against the cleaner.
     *
     * @var bool
     */
    private $removeScripts = true;

    /**
     * Set this to false to skip removing of style tags from the document body. This might have adverse effects. Defaults to true.
     *
     * NOTE(review): presumably only consulted while cleanupInput is enabled — confirm against the cleaner.
     *
     * @var bool
     */
    private $removeStyles = true;

    /**
     * Preserves Line Breaks if set to true. If set to false line breaks are cleaned up
     * as part of the input clean up process. Defaults to false.
     *
     * NOTE(review): presumably only consulted while cleanupInput is enabled — confirm against the cleaner.
     *
     * @var bool
     */
    private $preserveLineBreaks = false;

    /**
     * Set this to false if you want to preserve whitespace inside of text nodes. It is set to true by default.
     *
     * @var bool
     */
    private $removeDoubleSpace = true;

    /**
     * Set this to false if you want to preserve smarty script found in the html content. It is set to true by default.
     *
     * @var bool
     */
    private $removeSmartyScripts = true;

    /**
     * By default this is set to false. Setting this to true will apply the php function htmlspecialchars_decode to all attribute values and text nodes.
     *
     * @var bool
     */
    private $htmlSpecialCharsDecode = false;

    /**
     * A list of tags which will always be self closing.
     *
     * @var string[]
     */
    private $selfClosing = [
        'area',
        'base',
        'basefont',
        'br',
        'col',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'spacer',
        'track',
        'wbr',
    ];

    /**
     * A list of tags where there should be no /> at the end (html5 style).
     *
     * @var string[]
     */
    private $noSlash = [];

    public function isWhitespaceTextNode(): bool
    {
        return $this->whitespaceTextNode;
    }

    /**
     * @return Options a clone taken after this instance was modified
     */
    public function setWhitespaceTextNode(bool $whitespaceTextNode): Options
    {
        $this->whitespaceTextNode = $whitespaceTextNode;

        return clone $this;
    }

    public function isStrict(): bool
    {
        return $this->strict;
    }

    /**
     * @return Options a clone taken after this instance was modified
     */
    public function setStrict(bool $strict): Options
    {
        $this->strict = $strict;

        return clone $this;
    }

    public function getEnforceEncoding(): ?string
    {
        return $this->enforceEncoding;
    }

    /**
     * @return Options a clone taken after this instance was modified
     */
    public function setEnforceEncoding(?string $enforceEncoding): Options
    {
        $this->enforceEncoding = $enforceEncoding;

        return clone $this;
    }

    public function isCleanupInput(): bool
    {
        return $this->cleanupInput;
    }

    /**
     * @return Options a clone taken after this instance was modified
     */
    public function setCleanupInput(bool $cleanupInput): Options
    {
        $this->cleanupInput = $cleanupInput;

        return clone $this;
    }

    public function isRemoveScripts(): bool
    {
        return $this->removeScripts;
    }

    /**
     * @return Options a clone taken after this instance was modified
     */
    public function setRemoveScripts(bool $removeScripts): Options
    {
        $this->removeScripts = $removeScripts;

        return clone $this;
    }

    public function isRemoveStyles(): bool
    {
        return $this->removeStyles;
    }

    /**
     * @return Options a clone taken after this instance was modified
     */
    public function setRemoveStyles(bool $removeStyles): Options
    {
        $this->removeStyles = $removeStyles;

        return clone $this;
    }

    public function isPreserveLineBreaks(): bool
    {
        return $this->preserveLineBreaks;
    }

    /**
     * @return Options a clone taken after this instance was modified
     */
    public function setPreserveLineBreaks(bool $preserveLineBreaks): Options
    {
        $this->preserveLineBreaks = $preserveLineBreaks;

        return clone $this;
    }

    public function isRemoveDoubleSpace(): bool
    {
        return $this->removeDoubleSpace;
    }

    /**
     * @return Options a clone taken after this instance was modified
     */
    public function setRemoveDoubleSpace(bool $removeDoubleSpace): Options
    {
        $this->removeDoubleSpace = $removeDoubleSpace;

        return clone $this;
    }

    public function isRemoveSmartyScripts(): bool
    {
        return $this->removeSmartyScripts;
    }

    /**
     * @return Options a clone taken after this instance was modified
     */
    public function setRemoveSmartyScripts(bool $removeSmartyScripts): Options
    {
        $this->removeSmartyScripts = $removeSmartyScripts;

        return clone $this;
    }

    public function isHtmlSpecialCharsDecode(): bool
    {
        return $this->htmlSpecialCharsDecode;
    }

    /**
     * @return Options a clone taken after this instance was modified
     */
    public function setHtmlSpecialCharsDecode(bool $htmlSpecialCharsDecode): Options
    {
        $this->htmlSpecialCharsDecode = $htmlSpecialCharsDecode;

        return clone $this;
    }

    /**
     * @return string[]
     */
    public function getSelfClosing(): array
    {
        return $this->selfClosing;
    }

    /**
     * Replaces the list of tags that will always be self closing.
     *
     * @param string[] $selfClosing
     *
     * @return Options a clone taken after this instance was modified
     */
    public function setSelfClosing(array $selfClosing): Options
    {
        $this->selfClosing = $selfClosing;

        return clone $this;
    }

    /**
     * Adds the tag to the list of tags that will always be self closing.
     *
     * @return Options a clone taken after this instance was modified
     */
    public function addSelfClosingTag(string $tag): Options
    {
        $this->selfClosing[] = $tag;

        return clone $this;
    }

    /**
     * Adds the tags to the list of tags that will always be self closing.
     *
     * @param string[] $tags
     *
     * @return Options a clone taken after this instance was modified
     */
    public function addSelfClosingTags(array $tags): Options
    {
        foreach ($tags as $tag) {
            $this->selfClosing[] = $tag;
        }

        return clone $this;
    }

    /**
     * Removes the tag from the list of tags that will always be self closing.
     *
     * @return Options a clone taken after this instance was modified
     */
    public function removeSelfClosingTag(string $tag): Options
    {
        // array_diff() preserves keys; re-index so the list stays a gap-free string[]
        $this->selfClosing = \array_values(\array_diff($this->selfClosing, [$tag]));

        return clone $this;
    }

    /**
     * Sets the list of self closing tags to empty.
     *
     * @return Options a clone taken after this instance was modified
     */
    public function clearSelfClosingTags(): Options
    {
        $this->selfClosing = [];

        return clone $this;
    }

    /**
     * @return string[]
     */
    public function getNoSlash(): array
    {
        return $this->noSlash;
    }

    /**
     * Replaces the list of self closing tags that are rendered without a trailing slash.
     *
     * @param string[] $noSlash
     *
     * @return Options a clone taken after this instance was modified
     */
    public function setNoSlash(array $noSlash): Options
    {
        $this->noSlash = $noSlash;

        return clone $this;
    }

    /**
     * Adds a tag to the list of self closing tags that should not have a trailing slash.
     *
     * @return Options a clone taken after this instance was modified
     */
    public function addNoSlashTag(string $tag): Options
    {
        $this->noSlash[] = $tag;

        return clone $this;
    }

    /**
     * Removes a tag from the list of no-slash tags.
     *
     * @return Options a clone taken after this instance was modified
     */
    public function removeNoSlashTag(string $tag): Options
    {
        // array_diff() preserves keys; re-index so the list stays a gap-free string[]
        $this->noSlash = \array_values(\array_diff($this->noSlash, [$tag]));

        return clone $this;
    }

    /**
     * Empties the list of no-slash tags.
     *
     * @return Options a clone taken after this instance was modified
     */
    public function clearNoSlashTags(): Options
    {
        $this->noSlash = [];

        return clone $this;
    }

    /**
     * Copies every option from $options and returns the resulting Options.
     *
     * Because each setter returns a clone, only the first setter in the chain
     * (setCleanupInput) modifies this instance; the fully populated object is
     * the RETURN VALUE, which callers must use.
     */
    public function setFromOptions(Options $options): Options
    {
        return $this->setCleanupInput($options->isCleanupInput())
            ->setEnforceEncoding($options->getEnforceEncoding())
            ->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode())
            ->setPreserveLineBreaks($options->isPreserveLineBreaks())
            ->setRemoveDoubleSpace($options->isRemoveDoubleSpace())
            ->setRemoveScripts($options->isRemoveScripts())
            ->setRemoveSmartyScripts($options->isRemoveSmartyScripts())
            ->setRemoveStyles($options->isRemoveStyles())
            ->setStrict($options->isStrict())
            ->setWhitespaceTextNode($options->isWhitespaceTextNode())
            ->setSelfClosing($options->getSelfClosing())
            ->setNoSlash($options->getNoSlash());
    }
}
@@ -0,0 +1,116 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Selector;
use PHPHtmlParser\Contracts\Selector\ParserInterface;
use PHPHtmlParser\DTO\Selector\ParsedSelectorCollectionDTO;
use PHPHtmlParser\DTO\Selector\ParsedSelectorDTO;
use PHPHtmlParser\DTO\Selector\RuleDTO;
/**
 * This is the default parser for the selector.
 *
 * Turns a CSS-like selector string into rule DTOs, grouped into one parsed
 * selector per comma-separated alternative.
 */
class Parser implements ParserInterface
{
    /**
     * Pattern of CSS selectors, modified from 'mootools'.
     *
     * Capture groups: 1 tag, 2 id, 3 class list, 4 attribute key,
     * 5 operator, 6 attribute value, 7 separator.
     *
     * @var string
     */
    private $pattern = "/([\w\-:\*>]*)(?:\#([\w\-]+)|\.([\w\.\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";

    /**
     * Parses the selector string.
     *
     * The value-carrying groups (id, class, attribute value) are compared
     * against the empty string rather than tested with empty(), so that a
     * literal "0" (e.g. [data-count=0]) is kept instead of being silently
     * dropped. The attribute-key group keeps its original empty() test.
     *
     * @param string $selector selector to parse; a trailing space is appended so the
     *                         last rule is terminated by a separator
     */
    public function parseSelectorString(string $selector): ParsedSelectorCollectionDTO
    {
        $selectors = [];
        $matches = [];
        $rules = [];
        \preg_match_all($this->pattern, \trim($selector) . ' ', $matches, PREG_SET_ORDER);

        foreach ($matches as $match) {
            // default values
            $tag = \strtolower(\trim($match[1]));
            $operator = '=';
            $key = null;
            $value = null;
            $noKey = false;

            // a bare '>' alters the behavior of the next element
            $alterNext = $tag == '>';

            // check for id selector
            if (($match[2] ?? '') !== '') {
                $key = 'id';
                $value = $match[2];
            }

            // check for class selector
            if (($match[3] ?? '') !== '') {
                $key = 'class';
                $value = \explode('.', $match[3]);
            }

            // and final attribute selector
            if (!empty($match[4])) {
                $key = \strtolower($match[4]);
            }
            if (!empty($match[5])) {
                $operator = $match[5];
            }
            if (($match[6] ?? '') !== '') {
                $value = $match[6];
                if (\strpos($value, '][') !== false) {
                    // we have multiple type selectors: [a=1][b=2] arrives here as
                    // key "a" with value "1][b=2"
                    $key = [$key];
                    $parts = \explode('][', $value);
                    $value = [];
                    foreach ($parts as $part) {
                        if (\strpos($part, '=') !== false) {
                            // limit 2 keeps any further '=' inside the value
                            list($first, $second) = \explode('=', $part, 2);
                            $key[] = $first;
                            $value[] = $second;
                        } else {
                            $value[] = $part;
                        }
                    }
                }
            }

            // check for elements that do not have a specified attribute
            if (\is_string($key) && isset($key[0]) && $key[0] == '!') {
                $key = \substr($key, 1);
                $noKey = true;
            }

            $rules[] = RuleDTO::makeFromPrimitives(
                $tag,
                $operator,
                $key,
                $value,
                $noKey,
                $alterNext
            );

            // a comma separator closes the current selector and starts a new one
            if (isset($match[7]) && \is_string($match[7]) && \trim($match[7]) == ',') {
                $selectors[] = ParsedSelectorDTO::makeFromRules($rules);
                $rules = [];
            }
        }

        // save last results
        if (\count($rules) > 0) {
            $selectors[] = ParsedSelectorDTO::makeFromRules($rules);
        }

        return ParsedSelectorCollectionDTO::makeCollection($selectors);
    }
}
@@ -0,0 +1,316 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Selector;
use PHPHtmlParser\Contracts\Selector\SeekerInterface;
use PHPHtmlParser\Dom\Node\AbstractNode;
use PHPHtmlParser\Dom\Node\InnerNode;
use PHPHtmlParser\Dom\Node\LeafNode;
use PHPHtmlParser\DTO\Selector\RuleDTO;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
/**
 * Default seeker: applies one parsed selector rule to a set of nodes and
 * collects the matching descendants.
 */
class Seeker implements SeekerInterface
{
    /**
     * Attempts to find all children that match the rule
     * given.
     *
     * When the rule key is numeric the rule is treated as a 1-based index
     * over $nodes themselves. Otherwise every child of every node is tested,
     * and children that can hold children are searched recursively unless the
     * 'checkGrandChildren' option is present and falsy.
     *
     * @param InnerNode[] $nodes   nodes to search
     * @param RuleDTO     $rule    rule to apply
     * @param array       $options list of option arrays; merged by flattenOptions()
     *
     * @throws ChildNotFoundException
     *
     * @return AbstractNode[] matching nodes
     */
    public function seek(array $nodes, RuleDTO $rule, array $options): array
    {
        // XPath index
        if ($rule->getTag() !== null && \is_numeric($rule->getKey())) {
            $count = 0;
            foreach ($nodes as $node) {
                if ($rule->getTag() == '*'
                    || $rule->getTag() == $node->getTag()->name()
                ) {
                    ++$count;
                    if ($count == $rule->getKey()) {
                        // found the node we wanted
                        return [$node];
                    }
                }
            }

            return [];
        }

        $options = $this->flattenOptions($options);
        $descend = !isset($options['checkGrandChildren']) || $options['checkGrandChildren'];

        $return = [];
        foreach ($nodes as $node) {
            // check if we are a leaf
            if ($node instanceof LeafNode || !$node->hasChildren()) {
                continue;
            }

            $child = $node->firstChild();
            while (!\is_null($child)) {
                // wild card, grab all
                if ($rule->getTag() == '*' && \is_null($rule->getKey())) {
                    $return[] = $child;
                    $child = $this->getNextChild($node, $child);
                    continue;
                }

                $pass = $this->checkTag($rule, $child);
                if ($pass && $rule->getKey() !== null) {
                    $pass = $this->checkKey($rule, $child);
                }
                if ($pass &&
                    $rule->getKey() !== null &&
                    $rule->getValue() !== null &&
                    $rule->getValue() != '*'
                ) {
                    $pass = $this->checkComparison($rule, $child);
                }

                if ($pass) {
                    // it passed all checks
                    $return[] = $child;
                }

                // whether or not the child matched, its own children may match too
                if ($descend && $child instanceof InnerNode && $child->hasChildren()) {
                    // $options is already flat here; wrap it so the recursive
                    // call can flatten it again without iterating scalars
                    $matches = $this->seek([$child], $rule, [$options]);
                    foreach ($matches as $match) {
                        $return[] = $match;
                    }
                }

                $child = $this->getNextChild($node, $child);
            }
        }

        return $return;
    }

    /**
     * Checks comparison condition from rules against node.
     *
     * The key 'plaintext' compares against the node's text; any other key
     * compares against the attribute of that name. With an array of keys,
     * every key must match the rule value at the same index.
     */
    private function checkComparison(RuleDTO $rule, AbstractNode $node): bool
    {
        if ($rule->getKey() == 'plaintext') {
            // plaintext search
            $nodeValue = $node->text();

            return $this->checkNodeValue($nodeValue, $rule, $node);
        }

        // normal search
        if (!\is_array($rule->getKey())) {
            $nodeValue = $node->getAttribute($rule->getKey());

            return $this->checkNodeValue($nodeValue, $rule, $node);
        }

        $result = true;
        foreach ($rule->getKey() as $index => $key) {
            $nodeValue = $node->getAttribute($key);
            $result = $result &&
                $this->checkNodeValue($nodeValue, $rule, $node, $index);
        }

        return $result;
    }

    /**
     * Flattens the option array.
     *
     * Merges a list of option arrays into one map; later entries win.
     *
     * @return array
     */
    private function flattenOptions(array $optionsArray)
    {
        $options = [];
        foreach ($optionsArray as $optionArray) {
            foreach ($optionArray as $key => $option) {
                $options[$key] = $option;
            }
        }

        return $options;
    }

    /**
     * Returns the next child or null if no more children.
     *
     * @return AbstractNode|null
     */
    private function getNextChild(
        AbstractNode $node,
        AbstractNode $currentChild
    ) {
        try {
            $child = null;
            if ($node instanceof InnerNode) {
                // get next child
                $child = $node->nextChild($currentChild->id());
            }
        } catch (ChildNotFoundException $e) {
            // no more children
            unset($e);
            $child = null;
        }

        return $child;
    }

    /**
     * Checks tag condition from rules against node.
     *
     * An empty rule tag and '*' match every node.
     */
    private function checkTag(RuleDTO $rule, AbstractNode $node): bool
    {
        if (!empty($rule->getTag()) && $rule->getTag() != $node->getTag()->name()
            && $rule->getTag() != '*'
        ) {
            return false;
        }

        return true;
    }

    /**
     * Checks key condition from rules against node.
     *
     * For "no key" rules the node must NOT carry the attribute(s); otherwise
     * it must carry all of them ('plaintext' is exempt from that check).
     */
    private function checkKey(RuleDTO $rule, AbstractNode $node): bool
    {
        if (!\is_array($rule->getKey())) {
            if ($rule->isNoKey()) {
                if ($node->getAttribute($rule->getKey()) !== null) {
                    return false;
                }
            } else {
                if ($rule->getKey() != 'plaintext'
                    && !$node->hasAttribute($rule->getKey())
                ) {
                    return false;
                }
            }
        } else {
            if ($rule->isNoKey()) {
                foreach ($rule->getKey() as $key) {
                    if (!\is_null($node->getAttribute($key))) {
                        return false;
                    }
                }
            } else {
                foreach ($rule->getKey() as $key) {
                    if ($key != 'plaintext'
                        && !$node->hasAttribute($key)
                    ) {
                        return false;
                    }
                }
            }
        }

        return true;
    }

    /**
     * Compares one node value with the rule value.
     *
     * Three cases: a plain string rule value; a list of class names, each of
     * which must match one of the node's classes; and multi-attribute rules,
     * where $index selects the rule value belonging to the current key.
     *
     * @param string|null  $nodeValue value read from the node (attribute or text)
     * @param RuleDTO      $rule      rule being evaluated
     * @param AbstractNode $node      node being tested (re-read for the class list)
     * @param int|null     $index     position of the current key for multi-attribute rules
     */
    private function checkNodeValue(
        ?string $nodeValue,
        RuleDTO $rule,
        AbstractNode $node,
        ?int $index = null
    ): bool {
        $check = false;
        if (\is_string($rule->getValue()) && $nodeValue !== null) {
            $check = $this->match($rule->getOperator(), $rule->getValue(), $nodeValue);
        }

        // handle multiple classes
        $key = $rule->getKey();
        if (
            !$check &&
            $key == 'class' &&
            \is_array($rule->getValue())
        ) {
            $nodeClasses = \explode(' ', $node->getAttribute('class') ?? '');
            foreach ($rule->getValue() as $value) {
                // every required class is judged on its own; without this reset
                // a hit for the previous class leaked through empty class tokens
                $check = false;
                foreach ($nodeClasses as $class) {
                    if (
                        $class !== '' &&
                        \is_string($rule->getOperator())
                    ) {
                        $check = $this->match($rule->getOperator(), $value, $class);
                    }
                    if ($check) {
                        break;
                    }
                }
                if (!$check) {
                    break;
                }
            }
        } elseif (
            !$check &&
            \is_array($key) &&
            !\is_null($nodeValue) &&
            \is_string($rule->getOperator())
        ) {
            $ruleValue = $rule->getValue();
            $expected = (\is_array($ruleValue) && $index !== null)
                ? ($ruleValue[$index] ?? null)
                : null;
            if (\is_string($expected)) {
                $check = $this->match($rule->getOperator(), $expected, $nodeValue);
            }
        }

        return $check;
    }

    /**
     * Attempts to match the given arguments with the given operator.
     *
     * Both sides are lower-cased first. For '*=' a pattern starting with '/'
     * is used verbatim as a regular expression; any other pattern is wrapped
     * in /.../i without quoting, so regex metacharacters in it stay active.
     */
    private function match(
        string $operator,
        string $pattern,
        string $value
    ): bool {
        $value = \strtolower($value);
        $pattern = \strtolower($pattern);
        switch ($operator) {
            case '=':
                return $value === $pattern;
            case '!=':
                return $value !== $pattern;
            case '^=':
                return \preg_match('/^' . \preg_quote($pattern, '/') . '/',
                        $value) == 1;
            case '$=':
                return \preg_match('/' . \preg_quote($pattern, '/') . '$/',
                        $value) == 1;
            case '*=':
                // guard the offset read: an empty pattern has no first character
                if ($pattern !== '' && $pattern[0] == '/') {
                    return \preg_match($pattern, $value) == 1;
                }

                return \preg_match('/' . $pattern . '/i', $value) == 1;
            default:
                return false;
        }
    }
}
@@ -0,0 +1,105 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Selector;
use PHPHtmlParser\Contracts\Selector\ParserInterface;
use PHPHtmlParser\Contracts\Selector\SeekerInterface;
use PHPHtmlParser\Contracts\Selector\SelectorInterface;
use PHPHtmlParser\Discovery\SeekerDiscovery;
use PHPHtmlParser\Discovery\SelectorParserDiscovery;
use PHPHtmlParser\Dom\Node\AbstractNode;
use PHPHtmlParser\Dom\Node\Collection;
use PHPHtmlParser\DTO\Selector\ParsedSelectorCollectionDTO;
use PHPHtmlParser\DTO\Selector\RuleDTO;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
/**
 * Class Selector.
 *
 * Parses a selector string once at construction time and later applies the
 * parsed rules to a node through the configured seeker.
 */
class Selector implements SelectorInterface
{
    /**
     * Result of parsing the selector string.
     *
     * @var ParsedSelectorCollectionDTO
     */
    private $ParsedSelectorCollectionDTO;

    /**
     * Strategy that evaluates a single rule against a list of nodes.
     *
     * @var SeekerInterface
     */
    private $seeker;

    /**
     * Builds the selector; missing collaborators are obtained from the
     * discovery classes.
     */
    public function __construct(string $selector, ?ParserInterface $parser = null, ?SeekerInterface $seeker = null)
    {
        $parser = $parser ?? SelectorParserDiscovery::find();
        $seeker = $seeker ?? SeekerDiscovery::find();

        $this->ParsedSelectorCollectionDTO = $parser->parseSelectorString($selector);
        $this->seeker = $seeker;
    }

    /**
     * Gives access to the parsed form of the selector string.
     */
    public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO
    {
        return $this->ParsedSelectorCollectionDTO;
    }

    /**
     * Runs every parsed selector against the given node and gathers all
     * resulting nodes in one collection.
     *
     * @throws ChildNotFoundException
     */
    public function find(AbstractNode $node): Collection
    {
        $found = new Collection();

        foreach ($this->ParsedSelectorCollectionDTO->getParsedSelectorDTO() as $parsedSelector) {
            if (\count($parsedSelector->getRules()) == 0) {
                continue;
            }

            $current = [$node];
            $pendingOptions = [];
            foreach ($parsedSelector->getRules() as $rule) {
                if (!$rule->isAlterNext()) {
                    $current = $this->seeker->seek($current, $rule, $pendingOptions);
                    // options only apply to the rule that directly follows them
                    $pendingOptions = [];
                } else {
                    $pendingOptions[] = $this->alterNext($rule);
                }
            }

            // whatever survived the last rule is the result for this selector
            foreach ($current as $hit) {
                $found[] = $hit;
            }
        }

        return $found;
    }

    /**
     * Translates an "alter next" rule into the options handed to the seeker
     * for the following rule.
     */
    private function alterNext(RuleDTO $rule): array
    {
        if ($rule->getTag() == '>') {
            return ['checkGrandChildren' => false];
        }

        return [];
    }
}
@@ -0,0 +1,114 @@
<?php
declare(strict_types=1);
namespace PHPHtmlParser;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Client;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
use PHPHtmlParser\Exceptions\CircularException;
use PHPHtmlParser\Exceptions\NotLoadedException;
use PHPHtmlParser\Exceptions\StrictException;
use Psr\Http\Client\ClientInterface;
use Psr\Http\Message\RequestInterface;
/**
 * Class StaticDom.
 *
 * Static facade around the most recently created (or mounted) Dom instance.
 */
final class StaticDom
{
    /**
     * The Dom that unknown static calls are forwarded to, or null when none is loaded.
     *
     * @var Dom|null
     */
    private static $dom = null;

    /**
     * Attempts to call the given method on the most recent created dom
     * from below.
     *
     * @throws NotLoadedException when no Dom has been loaded or mounted
     *
     * @return mixed
     */
    public static function __callStatic(string $method, array $arguments)
    {
        if (self::$dom instanceof Dom) {
            return \call_user_func_array([self::$dom, $method], $arguments);
        }
        throw new NotLoadedException('The dom is not loaded. Can not call a dom method.');
    }

    /**
     * Call this to mount the static facade. The facade allows you to use
     * this object as a $className.
     *
     * @param string $className alias to register for this class
     * @param ?Dom   $dom       Dom to forward static calls to; ignored when null
     *
     * @return bool false (and nothing is changed) when a class named $className already exists
     */
    public static function mount(string $className = 'Dom', ?Dom $dom = null): bool
    {
        if (\class_exists($className)) {
            return false;
        }
        \class_alias(__CLASS__, $className);
        if ($dom instanceof Dom) {
            self::$dom = $dom;
        }

        return true;
    }

    /**
     * Creates a new dom object and calls loadFromFile() on the
     * new object.
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws StrictException
     * @throws Exceptions\LogicalException
     */
    public static function loadFromFile(string $file, ?Options $options = null): Dom
    {
        $dom = new Dom();
        self::$dom = $dom;

        return $dom->loadFromFile($file, $options);
    }

    /**
     * Creates a new dom object and calls loadFromUrl() on the
     * new object.
     *
     * A Guzzle client and a GET request for $url are created when the
     * corresponding argument is null. The nullable parameters are declared
     * explicitly; relying on "= null" alone is deprecated as of PHP 8.4.
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws StrictException
     * @throws \Psr\Http\Client\ClientExceptionInterface
     */
    public static function loadFromUrl(string $url, ?Options $options = null, ?ClientInterface $client = null, ?RequestInterface $request = null): Dom
    {
        $dom = new Dom();
        self::$dom = $dom;
        if (\is_null($client)) {
            $client = new Client();
        }
        if (\is_null($request)) {
            $request = new Request('GET', $url);
        }

        return $dom->loadFromUrl($url, $options, $client, $request);
    }

    /**
     * Creates a new dom object and calls loadStr() on the new object.
     */
    public static function loadStr(string $str, ?Options $options = null): Dom
    {
        $dom = new Dom();
        self::$dom = $dom;

        return $dom->loadStr($str, $options);
    }

    /**
     * Sets the $dom variable to null.
     */
    public static function unload(): void
    {
        self::$dom = null;
    }
}
@@ -0,0 +1,21 @@
# PHPHtmlParser Contribution Guide
This page contains guidelines for contributing to the PHPHtmlParser package. Please review these guidelines before submitting any pull requests to the package.
## Pull Requests
The pull request process differs for new features and bugs. Before sending a pull request for a new feature, you should first create an issue with `[Proposal]` in the title. The proposal should describe the new feature, as well as implementation ideas. The proposal will then be reviewed and either approved or denied. Once a proposal is approved, a pull request may be created implementing the new feature. Pull requests which do not follow this guideline will be closed immediately.
Pull requests for bugs may be sent without creating any proposal issue. If you believe that you know of a solution for a bug that has been filed on Github, please leave a comment detailing your proposed fix.
### Feature Requests
If you have an idea for a new feature you would like to see added to the package, you may create an issue on Github with `[Request]` in the title. The feature request will then be reviewed.
## Coding Guidelines
We follow the [PSR-0](https://github.com/php-fig/fig-standards/blob/master/accepted/PSR-0.md) autoloading standard and take heavily from the [PSR-1](https://github.com/php-fig/fig-standards/blob/master/accepted/PSR-1-basic-coding-standard.md) coding standards. In addition to these standards, below is a list of other coding standards that should be followed:
- Class opening `{` should be on the same line as the class name.
- Function and control structure opening `{` should be on a separate line.
- Interface names are suffixed with `Interface` (`FooInterface`)
@@ -0,0 +1,28 @@
String Encode
==========================
Version 1.0.1
String Encode is a simple PHP wrapper package to facilitate the encoding of strings in different charsets.
Install
-------
This package can be found on [packagist](https://packagist.org/packages/paquettg/string-encode) and is best loaded using [composer](http://getcomposer.org/). It requires PHP 7.1 or higher, so keep that in mind.
Usage
-----
This is a really simple package so there is not much to say about it. The following is just about the only usage for this package at the moment.
```php
use stringEncode\Encode;
$str = "Calendrier de l'avent façon Necta!";
$encode = new Encode;
$encode->detect($str);
$newstr = $encode->convert($str);
echo $newstr; // "Calendrier de l'avent façon Necta!" in UTF-8 encoding (default)
```
As you can see, it is a very simple encoding converter.
@@ -0,0 +1,28 @@
{
"name": "paquettg/string-encode",
"type": "library",
"description": "Facilitating the process of altering string encoding in PHP.",
"version": "1.0.1",
"keywords": ["encoding", "charset", "string"],
"homepage": "https://github.com/paquettg/string-encoder",
"license": "MIT",
"authors": [
{
"name": "Gilles Paquette",
"email": "paquettg@gmail.com",
"homepage": "http://gillespaquette.ca"
}
],
"require": {
"php": ">=7.1"
},
"require-dev": {
"phpunit/phpunit": "^7.5.1"
},
"autoload": {
"psr-0": {
"stringEncode": "src/"
}
},
"minimum-stability": "dev"
}
@@ -0,0 +1,28 @@
<?php
// --------------------------------------------------------------------------
// Register the Composer autoloader
// --------------------------------------------------------------------------
// Pull in Composer's generated class loader so that none of the classes
// used afterwards have to be required by hand.
require __DIR__ . '/vendor/autoload.php';

// --------------------------------------------------------------------------
// Set the default timezone
// --------------------------------------------------------------------------
// Set the default timezone explicitly; it is the one PHP's date and
// date-time functions use from here on.
date_default_timezone_set('UTC');
@@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>
<phpunit backupGlobals="false"
backupStaticAttributes="false"
bootstrap="phpunit.php"
colors="true"
convertErrorsToExceptions="true"
convertNoticesToExceptions="true"
convertWarningsToExceptions="true"
processIsolation="false"
stopOnFailure="false"
>
<testsuites>
<testsuite name="Repository Test Suite">
<directory>./tests/</directory>
</testsuite>
</testsuites>
<filter>
<whitelist addUncoveredFilesFromWhitelist="false">
<directory suffix=".php">src</directory>
<exclude>
<directory suffix=".php">vendor</directory>
</exclude>
</whitelist>
</filter>
</phpunit>
@@ -0,0 +1,121 @@
<?php
namespace stringEncode;
class Encode {

    /**
     * The encoding that the string is currently in.
     *
     * @var string
     */
    protected $from;

    /**
     * The encoding that we would like the string to be in.
     *
     * @var string
     */
    protected $to;

    /**
     * Sets the default charsets for this package.
     */
    public function __construct()
    {
        // default from encoding
        $this->from = 'CP1252';

        // default to encoding
        $this->to = 'UTF-8';
    }

    /**
     * Sets the charset that we will be converting to.
     * The name is stored upper-cased.
     *
     * @param string $charset
     * @return $this
     * @chainable
     */
    public function to($charset)
    {
        $this->to = strtoupper($charset);

        return $this;
    }

    /**
     * Sets the charset that we will be converting from.
     * The name is stored upper-cased.
     *
     * @param string $charset
     * @return $this
     * @chainable
     */
    public function from($charset)
    {
        $this->from = strtoupper($charset);

        // Return the instance so the method is chainable as documented,
        // matching to().
        return $this;
    }

    /**
     * Returns the to and from charset that we will be using.
     *
     * @return array array with the keys 'from' and 'to'
     */
    public function charset()
    {
        return [
            'from' => $this->from,
            'to'   => $this->to,
        ];
    }

    /**
     * Attempts to detect the encoding of the given string from the encodingList.
     * On success the detected charset becomes the "from" charset.
     *
     * @param string $str
     * @param array $encodingList
     * @return bool false when no charset could be detected, true otherwise
     */
    public function detect($str, $encodingList = ['UTF-8', 'CP1252'])
    {
        $charset = mb_detect_encoding($str, $encodingList);
        if ($charset === false)
        {
            // could not detect charset
            return false;
        }

        $this->from = $charset;

        return true;
    }

    /**
     * Attempts to convert the string to the proper charset.
     *
     * When the target charset is UTF-8, a byte order mark at the very
     * start and/or the very end of the result is removed.
     *
     * @param string $str
     * @return string
     * @throws Exception when the conversion fails
     */
    public function convert($str)
    {
        // nothing to convert when both charsets are already the same
        if ($this->from !== $this->to)
        {
            $str = iconv($this->from, $this->to, $str);
        }

        if ($str === false)
        {
            // the conversion was a failure
            throw new Exception('The convertion from "'.$this->from.'" to "'.$this->to.'" was a failure.');
        }

        // deal with BOM issue for utf-8 text
        if ($this->to === 'UTF-8')
        {
            if (substr($str, 0, 3) === "\xef\xbb\xbf")
            {
                $str = substr($str, 3);
            }
            if (substr($str, -3, 3) === "\xef\xbb\xbf")
            {
                $str = substr($str, 0, -3);
            }
        }

        return $str;
    }
}
@@ -0,0 +1,4 @@
<?php
namespace stringEncode;
/**
 * Exception type of the stringEncode package. It adds nothing to the base
 * \Exception beyond a distinct class name that callers can catch.
 */
class Exception extends \Exception
{
}
@@ -0,0 +1,30 @@
<?php
declare(strict_types=1);
use PHPUnit\Framework\TestCase;
use stringEncode\Encode;
class ContentTest extends TestCase {

    /**
     * to() stores the target charset that charset() reports under 'to'.
     */
    public function testTo()
    {
        $encoder = new Encode();
        $encoder->to('ISO-8859-1');

        $charsets = $encoder->charset();
        $this->assertEquals('ISO-8859-1', $charsets['to']);
    }

    /**
     * from() stores the source charset that charset() reports under 'from'.
     */
    public function testFrom()
    {
        $encoder = new Encode();
        $encoder->from('ISO-8859-1');

        $charsets = $encoder->charset();
        $this->assertEquals('ISO-8859-1', $charsets['from']);
    }

    /**
     * detect() on the sample string sets the source charset to UTF-8.
     */
    public function testDetect()
    {
        $encoder = new Encode();
        $encoder->detect('Calendrier de l\'avent façon Necta!');

        $charsets = $encoder->charset();
        $this->assertEquals('UTF-8', $charsets['from']);
    }
}