The following code seems to work:
$dom= new DOMDocument('1.0', 'UTF-8');
$dom->loadHTML($this->htmlentities2stringcode(rawurldecode($content)) );
$dom->preserveWhiteSpace = true;
$innerHTML = str_replace("<html></html><html><body>", "",
str_replace("</body></html>", "",
str_replace("+","%2B",str_replace("<p></p>", "", $this->getInnerHTML( $dom )))));
return $this->stringcode2htmlentities($innerHTML));
}
// ----------------------------------------------------------
/**
 * Swap every special character listed in the entity map for its
 * caret-delimited placeholder (for example 'α' becomes '^alpha^').
 *
 * The substitution pairs come from getHTMLEntityStringCodeArray(): each array
 * key is searched for and replaced by the value stored under that key.
 *
 * @param string $string Text to encode.
 * @return string The text with every mapped character replaced by its placeholder.
 */
function htmlentities2stringcode($string) {
    // Build the map once and derive both the search and replace lists from it.
    $entityMap = $this->getHTMLEntityStringCodeArray();

    return str_replace(array_keys($entityMap), array_values($entityMap), $string);
}
// ----------------------------------------------------------
/**
 * Reverse of htmlentities2stringcode(): turn caret-delimited placeholders
 * (for example '^alpha^') back into the characters they stand for ('α').
 *
 * The substitution pairs come from getHTMLEntityStringCodeArray(), used in
 * the opposite direction: each array value is searched for and replaced by
 * its key. Pairs are applied in map order, so if two entries ever share the
 * same placeholder only the first entry's character is restored.
 *
 * @param string $string Text containing placeholders.
 * @return string The text with every known placeholder replaced by its character.
 */
function stringcode2htmlentities ($string) {
    // Build the map once and derive both the search and replace lists from it.
    $entityMap = $this->getHTMLEntityStringCodeArray();
    $placeholders = array_values($entityMap);
    $characters = array_keys($entityMap);

    return str_replace($placeholders, $characters, $string);
}
// -------------------------------------------------------------
/**
 * Lookup table pairing special characters with caret-delimited placeholders.
 *
 * Keys are the characters themselves; values are ASCII-only placeholders of
 * the form '^name^'. htmlentities2stringcode() replaces keys with values and
 * stringcode2htmlentities() replaces values with keys, so every placeholder
 * must be unique or the reverse conversion cannot restore the original
 * character.
 *
 * Notes on this table:
 *  - 'θ' uses '^theta^' and 'ϑ' uses '^thetasym^'. They previously shared
 *    '^thetasym^', which made the reverse conversion turn every 'ϑ' into 'θ'.
 *  - Invisible characters (em/en/thin space, LRM, RLM, ZWJ, ZWNJ) are written
 *    as explicit UTF-8 byte escapes so they cannot be silently altered by an
 *    editor or a copy/paste. This assumes the file itself is saved as UTF-8,
 *    like the other literal keys.
 *  - The '^lota^' spelling for capital iota is kept unchanged on purpose so
 *    existing placeholder strings keep decoding the same way.
 *
 * @return array Map of character => placeholder string.
 */
function getHTMLEntityStringCodeArray() {
    return array(
        'Α' => '^Alpha^',
        'Β' => '^Beta^',
        'Χ' => '^Chi^',
        '‡' => '^Dagger^',
        'Δ' => '^Delta^',
        'Ε' => '^Epsilon^',
        'Η' => '^Eta^',
        'Γ' => '^Gamma^',
        'Ι' => '^lota^',
        'Κ' => '^Kappa^',
        'Λ' => '^Lambda^',
        'Μ' => '^Mu^',
        'Ν' => '^Nu^',
        'Œ' => '^OElig^',
        'Ω' => '^Omega^',
        'Ο' => '^Omicron^',
        'Φ' => '^Phi^',
        'Π' => '^Pi^',
        '″' => '^Prime^',
        'Ψ' => '^Psi^',
        'Ρ' => '^Rho^',
        'Š' => '^Scaron^',
        'Σ' => '^Sigma^',
        'Τ' => '^Tau^',
        'Θ' => '^Theta^',
        'Υ' => '^Upsilon^',
        'Ξ' => '^Xi^',
        'Ÿ' => '^Yuml^',
        'Ζ' => '^Zeta^',
        'ℵ' => '^alefsym^',
        'α' => '^alpha^',
        '∧' => '^and^',
        '∠' => '^ang^',
        '≈' => '^asymp^',
        '„' => '^bdquo^',
        'β' => '^beta^',
        '•' => '^bull^',
        '∩' => '^cap^',
        'χ' => '^chi^',
        'ˆ' => '^circ^',
        '♣' => '^clubs^',
        '≅' => '^cong^',
        '↵' => '^crarr^',
        '∪' => '^cup^',
        '⇓' => '^dArr^',
        '†' => '^dagger^',
        '↓' => '^darr^',
        'δ' => '^delta^',
        '♦' => '^diams^',
        '∅' => '^empty^',
        "\xE2\x80\x83" => '^emsp^',   // U+2003 EM SPACE
        "\xE2\x80\x82" => '^ensp^',   // U+2002 EN SPACE
        'ε' => '^epsilon^',
        '≡' => '^equiv^',
        'η' => '^eta^',
        '€' => '^euro^',
        '∃' => '^exist^',
        'ƒ' => '^fnof^',
        '∀' => '^forall^',
        '⁄' => '^frasl^',
        'γ' => '^gamma^',
        '≥' => '^ge^',
        '⇔' => '^hArr^',
        '↔' => '^harr^',
        '♥' => '^hearts^',
        '…' => '^hellip^',
        'ℑ' => '^image^',
        '∞' => '^infin^',
        '∫' => '^int^',
        'ι' => '^iota^',
        '∈' => '^isin^',
        'κ' => '^kappa^',
        '⇐' => '^lArr^',
        'λ' => '^lambda^',
        '⟨' => '^lang^',
        '←' => '^larr^',
        '⌈' => '^lceil^',
        '“' => '^ldquo^',
        '≤' => '^le^',
        '⌊' => '^lfloor^',
        '∗' => '^lowast^',
        '◊' => '^loz^',
        "\xE2\x80\x8E" => '^lrm^',    // U+200E LEFT-TO-RIGHT MARK
        '‹' => '^lsaquo^',
        '‘' => '^lsquo^',
        '—' => '^mdash^',
        '−' => '^minus^',
        'μ' => '^mu^',
        '∇' => '^nabla^',
        '–' => '^ndash^',
        '≠' => '^ne^',
        '∋' => '^ni^',
        '∉' => '^notin^',
        '⊄' => '^nsub^',
        'ν' => '^nu^',
        'œ' => '^oelig^',
        '‾' => '^oline^',
        'ω' => '^omega^',
        'ο' => '^omicron^',
        '⊕' => '^oplus^',
        '∨' => '^or^',
        '⊗' => '^otimes^',
        '∂' => '^part^',
        '‰' => '^permil^',
        '⊥' => '^perp^',
        'φ' => '^phi^',
        'π' => '^pi^',
        'ϖ' => '^piv^',
        '′' => '^prime^',
        '∏' => '^prod^',
        '∝' => '^prop^',
        'ψ' => '^psi^',
        '⇒' => '^rArr^',
        '√' => '^radic^',
        '⟩' => '^rang^',
        '→' => '^rarr^',
        '⌉' => '^rceil^',
        '”' => '^rdquo^',
        'ℜ' => '^real^',
        '⌋' => '^rfloor^',
        'ρ' => '^rho^',
        "\xE2\x80\x8F" => '^rlm^',    // U+200F RIGHT-TO-LEFT MARK
        '›' => '^rsaquo^',
        '’' => '^rsquo^',
        '‚' => '^sbquo^',
        'š' => '^scaron^',
        '⋅' => '^sdot^',
        'σ' => '^sigma^',
        'ς' => '^sigmaf^',
        '∼' => '^sim^',
        '♠' => '^spades^',
        '⊂' => '^sub^',
        '⊆' => '^sube^',
        '∑' => '^sum^',
        '⊃' => '^sup^',
        '⊇' => '^supe^',
        'τ' => '^tau^',
        '∴' => '^there4^',
        'θ' => '^theta^',             // was '^thetasym^', colliding with 'ϑ' below
        'ϑ' => '^thetasym^',
        "\xE2\x80\x89" => '^thinsp^', // U+2009 THIN SPACE
        '˜' => '^tilde^',
        '™' => '^trade^',
        '⇑' => '^uArr^',
        '↑' => '^uarr^',
        'ϒ' => '^upsih^',
        'υ' => '^upsilon^',
        '℘' => '^weierp^',
        'ξ' => '^xi^',
        'ÿ' => '^yuml^',
        'ζ' => '^zeta^',
        "\xE2\x80\x8D" => '^zwj^',    // U+200D ZERO WIDTH JOINER
        "\xE2\x80\x8C" => '^zwnj^',   // U+200C ZERO WIDTH NON-JOINER
    );
}