From f50a8f68414e608fa07f24e4e1238b3d1eb2678b Mon Sep 17 00:00:00 2001
From: Andres Rey
Date: Sat, 10 Mar 2018 12:05:09 +0000
Subject: [PATCH 01/36] Add missing DOMEntity class
---
src/Nodes/DOM/DOMDocument.php | 3 ++-
src/Nodes/DOM/DOMEntity.php | 10 ++++++++++
2 files changed, 12 insertions(+), 1 deletion(-)
create mode 100644 src/Nodes/DOM/DOMEntity.php
diff --git a/src/Nodes/DOM/DOMDocument.php b/src/Nodes/DOM/DOMDocument.php
index a83f5b9c..81e9c7de 100644
--- a/src/Nodes/DOM/DOMDocument.php
+++ b/src/Nodes/DOM/DOMDocument.php
@@ -20,10 +20,11 @@ public function __construct($version, $encoding)
$this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class);
$this->registerNodeClass('DOMDocumentType', DOMDocumentType::class);
$this->registerNodeClass('DOMElement', DOMElement::class);
+ $this->registerNodeClass('DOMEntity', DOMEntity::class);
+ $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class);
$this->registerNodeClass('DOMNode', DOMNode::class);
$this->registerNodeClass('DOMNotation', DOMNotation::class);
$this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class);
$this->registerNodeClass('DOMText', DOMText::class);
- $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class);
}
}
diff --git a/src/Nodes/DOM/DOMEntity.php b/src/Nodes/DOM/DOMEntity.php
new file mode 100644
index 00000000..8493e731
--- /dev/null
+++ b/src/Nodes/DOM/DOMEntity.php
@@ -0,0 +1,10 @@
+
Date: Sat, 10 Mar 2018 17:40:39 +0000
Subject: [PATCH 02/36] Add _cleanClasses function
---
src/Nodes/NodeTrait.php | 4 ++++
src/Readability.php | 24 ++++++++++++++++++++++++
2 files changed, 28 insertions(+)
diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php
index 5a3cd7f2..bb848ab8 100644
--- a/src/Nodes/NodeTrait.php
+++ b/src/Nodes/NodeTrait.php
@@ -7,6 +7,10 @@
use andreskrey\Readability\Nodes\DOM\DOMNode;
use andreskrey\Readability\Nodes\DOM\DOMText;
+
+/**
+ * @method \DOMNode removeAttribute($name)
+ */
trait NodeTrait
{
/**
diff --git a/src/Readability.php b/src/Readability.php
index 91e703ce..9a293139 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -1479,6 +1479,28 @@ public function _cleanHeaders(DOMDocument $article)
}
}
+    /**
+     * Strips the class attribute from the given node and, recursively, from
+     * every node below it.
+     *
+     * Readability.js keeps the classes it adds itself; this port adds none, so
+     * any non-empty class attribute is removed without filtering.
+     *
+     * @param DOMDocument|DOMNode $node root of the subtree to clean
+     * @return void
+     */
+    public function _cleanClasses($node)
+    {
+        if ('' !== $node->getAttribute('class')) {
+            $node->removeAttribute('class');
+        }
+        // Visit each direct child; the recursive call covers its descendants.
+        $child = $node->firstChild;
+        while ($child !== null) {
+            $this->_cleanClasses($child);
+            $child = $child->nextSibling;
+        }
+    }
+
/**
* @param DOMDocument $article
*
@@ -1532,6 +1554,8 @@ public function postProcessContent(DOMDocument $article)
}
}
+ $this->_cleanClasses($article);
+
return $article;
}
From 746dd0bcf5f3b0e685d842252c620c01faff19b9 Mon Sep 17 00:00:00 2001
From: Andres Rey
Date: Sat, 10 Mar 2018 17:49:00 +0000
Subject: [PATCH 03/36] Remove all class attributes from the tests
---
test/test-pages/001/expected.html | 2 +-
test/test-pages/002/expected.html | 70 +--
test/test-pages/ars-1/expected.html | 56 +--
test/test-pages/bbc-1/expected.html | 22 +-
test/test-pages/blogger/expected.html | 14 +-
test/test-pages/breitbart/expected.html | 16 +-
test/test-pages/bug-1255978/expected.html | 46 +-
test/test-pages/buzzfeed-1/expected.html | 34 +-
test/test-pages/challenges/expected.html | 2 +-
test/test-pages/cnet/expected.html | 6 +-
test/test-pages/cnn/expected.html | 26 +-
.../test-pages/daringfireball-1/expected.html | 2 +-
test/test-pages/ehow-1/expected.html | 100 ++---
test/test-pages/ehow-2/expected.html | 92 ++--
test/test-pages/gmw/expected.html | 4 +-
test/test-pages/heise/expected.html | 12 +-
test/test-pages/herald-sun-1/expected.html | 14 +-
test/test-pages/iab-1/expected.html | 12 +-
test/test-pages/ietf-1/expected.html | 188 ++++----
test/test-pages/infobae/expected.html | 2 +-
test/test-pages/keep-images/expected.html | 256 +++++------
test/test-pages/lemonde-1/expected.html | 28 +-
test/test-pages/lemonde-2/expected.html | 20 +-
test/test-pages/liberation-1/expected.html | 2 +-
.../expected.html | 26 +-
.../lifehacker-working/expected.html | 26 +-
test/test-pages/links-in-tables/expected.html | 6 +-
test/test-pages/lwn-1/expected.html | 176 ++++----
test/test-pages/medium-1/expected.html | 206 ++++-----
test/test-pages/medium-2/expected.html | 2 +-
test/test-pages/medium-3/expected.html | 410 +++++++++---------
test/test-pages/mozilla-1/expected.html | 64 +--
test/test-pages/mozilla-2/expected.html | 72 +--
test/test-pages/msn/expected.html | 12 +-
.../needs-entity-normalization/expected.html | 4 +-
test/test-pages/nytimes-1/expected.html | 58 +--
test/test-pages/nytimes-2/expected.html | 58 +--
test/test-pages/pixnet/expected.html | 14 +-
test/test-pages/salon-1/expected.html | 2 +-
test/test-pages/simplyfound-1/expected.html | 2 +-
test/test-pages/social-buttons/expected.html | 2 +-
.../table-style-attributes/expected.html | 2 +-
test/test-pages/telegraph/expected.html | 42 +-
test/test-pages/tmz-1/expected.html | 10 +-
test/test-pages/tumblr/expected.html | 4 +-
test/test-pages/wapo-1/expected.html | 20 +-
test/test-pages/wapo-2/expected.html | 6 +-
test/test-pages/webmd-1/expected.html | 10 +-
test/test-pages/webmd-2/expected.html | 2 +-
test/test-pages/wikia/expected.html | 10 +-
test/test-pages/wikipedia/expected.html | 354 +++++++--------
test/test-pages/wordpress/expected.html | 8 +-
test/test-pages/yahoo-1/expected.html | 70 +--
test/test-pages/yahoo-2/expected.html | 40 +-
test/test-pages/yahoo-3/expected.html | 14 +-
test/test-pages/yahoo-4/expected.html | 2 +-
test/test-pages/youth/expected.html | 6 +-
57 files changed, 1383 insertions(+), 1383 deletions(-)
diff --git a/test/test-pages/001/expected.html b/test/test-pages/001/expected.html
index c101aecd..e05810ff 100644
--- a/test/test-pages/001/expected.html
+++ b/test/test-pages/001/expected.html
@@ -13,7 +13,7 @@
I guess.
Actually I've only found one which provides an adapter for Mocha and
actually works…
-