Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
176 views
in Technique[技术] by (71.8m points)

Get Inner HTML - PHP

I have the following code:

// Download the event listing page as one raw HTML string.
$data = file_get_contents('http://www.robotevents.com/robot-competitions/vex-robotics-competition?limit=all');
echo "Downloaded";

// Build a DOM tree from the markup; "@" hides the warnings raised while loading.
$dom = new DOMDocument();
@$dom->loadHTML($data);
$dom->preserveWhiteSpace = false;

// Rows are taken from the third <table> in the document (index 2).
$tables = $dom->getElementsByTagName('table');
$rows = $tables->item(2)->getElementsByTagName('tr');

// Print the text content of every cell, each followed by a line break.
foreach ($rows as $row) {
    foreach ($row->getElementsByTagName('td') as $cell) {
        echo $cell->nodeValue . "
";
    }
}

The final field contains a link whose URL I need to store. Also, the script outputs characters such as "?". Does anyone know how to do/fix these things?

See Question&Answers more detail:os

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Reply

0 votes
by (71.8m points)

I would recommend not using DOM to parse HTML, as it has problems with invalid HTML. Instead, use regular expressions.

I use this class:

<?php

    /**
    * Class to return HTML elements from a HTML document.
    *
    * Elements are located with regular expressions, so this is a lightweight
    * scraper rather than a real parser. Known limits: an element nested inside
    * another element of the same name is not balanced (the match ends at the
    * first closing tag), and a literal ">" inside an attribute value ends the
    * opening tag early.
    *
    * @version 0.3.1
    */
    class HTMLQuery
    {

        /**
        * Elements that never have a closing tag and therefore never have content.
        */
        protected $selfClosingTags = array( 'area', 'base', 'br', 'hr', 'img', 'input', 'link', 'meta', 'param' );

        /**
        * The HTML string that is searched.
        */
        private $html = '';

        /**
        * @param string|false $html Optional HTML to load right away; false loads nothing.
        */
        public function __construct( $html = false )
        {
            if( $html !== false )
                $this->load( $html );
        }

        /**
        * Load a HTML string, replacing whatever was loaded before.
        *
        * @param string $html
        */
        public function load( $html )
        {
            $this->html = (string) $html;
        }

        /**
        * Returns elements from the HTML.
        *
        * Each returned element is an array with the key 'name' (the tag name as
        * passed in), plus 'attributes' (lower-cased name => value) when the tag
        * has any name=value attributes, plus 'content' (the raw inner HTML) when
        * the element is not a void element and its content is not empty.
        *
        * @param string       $element         Tag name to look for, e.g. 'tr'.
        * @param string|false $attribute_match If given, only elements carrying this attribute are returned.
        * @param string|false $value_match     If given together with $attribute_match, that attribute
        *                                      must also have exactly this value.
        * @return array List of matching elements; empty when nothing matches.
        */
        public function getElements( $element, $attribute_match = false, $value_match = false )
        {
            $element_key = mb_strtolower( (string) $element );
            $is_void     = in_array( $element_key, $this->selfClosingTags, true );

            # Quote the tag name so it cannot inject regex syntax, and require a
            # boundary after it so that 'b' does not match <body> or <br>.
            $tag  = preg_quote( (string) $element, '/' );
            $open = '<' . $tag . '(?=[\s\/>])([^>]*)>';

            if( $is_void )
                $pattern = '/' . $open . '/i';
            else
                $pattern = '/' . $open . '(.*?)<\/' . $tag . '\s*>/is';

            $found = preg_match_all( $pattern, $this->html, $matches, PREG_SET_ORDER );
            if( !$found )
                return array();

            # The value filter only applies when an attribute filter is given too.
            $filter_by_attribute = $this->isFilterGiven( $attribute_match );
            $filter_by_value     = $filter_by_attribute && $this->isFilterGiven( $value_match );
            $wanted_attribute    = $filter_by_attribute ? mb_strtolower( (string) $attribute_match ) : '';

            $elements = array();
            foreach( $matches as $match )
            {
                $attributes = $this->parseAttributes( $match[1] );

                if( $filter_by_attribute )
                {
                    if( !array_key_exists( $wanted_attribute, $attributes ) )
                        continue;
                    # Compare against the requested attribute only, as plain strings.
                    if( $filter_by_value && (string) $attributes[$wanted_attribute] !== (string) $value_match )
                        continue;
                }

                $current_el = array( 'name' => $element );
                if( $attributes )
                    $current_el['attributes'] = $attributes;
                # Void elements have no second capture group; "0" is valid content.
                if( !$is_void && isset( $match[2] ) && $match[2] !== '' )
                    $current_el['content'] = $match[2];

                $elements[] = $current_el;
            }

            return $elements;
        }

        /**
        * Return an associative array of all the form inputs.
        *
        * Covers <input> and <button> (value taken from the 'value' attribute)
        * and <textarea> (value taken from the element content). Elements without
        * a name, or without a value/content, are skipped. Field names are
        * lower-cased.
        *
        * @return array Field name => value; empty when no fields are found.
        */
        public function getFormValues()
        {
            $elements = array_merge(
                $this->getElements( 'input' ),
                $this->getElements( 'textarea' ),
                $this->getElements( 'button' )
            );

            $form_values = array();
            foreach( $elements as $current_el )
            {
                if( !isset( $current_el['attributes']['name'] ) )
                    continue;

                $attribute_name = mb_strtolower( $current_el['attributes']['name'] );

                if( in_array( $current_el['name'], array( 'input', 'button' ), true ) )
                {
                    if( isset( $current_el['attributes']['value'] ) )
                        $form_values[$attribute_name] = $current_el['attributes']['value'];
                }
                elseif( isset( $current_el['content'] ) )
                    $form_values[$attribute_name] = $current_el['content'];
            }

            return $form_values;
        }

        /**
        * Tells whether an optional filter argument was actually supplied.
        * false, null and '' mean "no filter"; any other value (including "0") is a filter.
        */
        private function isFilterGiven( $value )
        {
            return $value !== false && $value !== null && $value !== '';
        }

        /**
        * Parses attributes into an array.
        *
        * Only name=value pairs are collected. Values may be wrapped in double,
        * single or typographic quotes, or be unquoted (then they end at the next
        * whitespace). When a name occurs more than once the last value wins.
        *
        * @param string $str Raw text between the tag name and the closing ">".
        * @return array Lower-cased attribute name => value; empty when there are none.
        */
        private function parseAttributes( $str )
        {
            $attributes = array();

            # Drop surrounding whitespace and the trailing "/" of a self-closed tag.
            $str = trim( rtrim( trim( $str ), '/' ) );
            if( $str === '' )
                return $attributes;

            # The "u" modifier makes the typographic quotes match as characters
            # rather than as single bytes of their UTF-8 encoding.
            $found = preg_match_all(
                '/([^\s=\/>"\'“”]+)\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|[“”]([^“”]*)[“”]|([^\s"\'“”]*))/u',
                $str,
                $matches,
                PREG_SET_ORDER
            );

            # "u" makes the match fail on input that is not valid UTF-8; retry
            # byte-wise with ASCII quotes only.
            if( $found === false )
                $found = preg_match_all(
                    '/([^\s=\/>"\']+)\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\']*))/',
                    $str,
                    $matches,
                    PREG_SET_ORDER
                );

            if( !$found )
                return $attributes;

            foreach( $matches as $match )
            {
                $attribute_name = mb_strtolower( $match[1] );
                $attributes[$attribute_name] = isset( $match[2] ) ? $match[2] : '';
            }

            return $attributes;
        }

    }

?>

Usage is:

// $data is the HTML string to search, e.g. a page downloaded with file_get_contents().
// Without passing it in (or calling load()), there is nothing to match against.
$c = new HTMLQuery( $data );
// Each <tr> comes back as an array with a 'name' key, plus 'attributes' and 'content' when present.
$x = $c->getElements( 'tr' );
print_r( $x );

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
OGeek|极客中国-欢迎来到极客的世界,一个免费开放的程序员编程交流平台!开放,进步,分享!让技术改变生活,让极客改变未来! Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

...