Update:
This is a response to the "combining" into a single regex...
It appears you don't care about reconstructing the order of the HTML.
So, if you just want to isolate the content of each sub-section, the code below is all you need.
However, you will need lists ( [] ) to reconstitute the order of embedded sub-sections.
After re-reading this question, note that the regex used below is the one you should be using.
use Data::Dumper;
use strict;
use warnings;

# Slurp the whole template from the DATA section.  `local $/` limits slurp
# mode to this do-block; a bare `$/ = undef` would leave it switched on for
# every later read in the program.
my $content = do { local $/; <DATA> };

# Root of the parsed tree; ParseCore fills it in place.
my $href = {};
ParseCore( $href, $content );
#print Dumper($href);

# Print the text owned by each nesting level, descending one block per step:
# base -> first -> second -> third -> fourth -> fifth.
my $node = $href;
print "\nBase======================\n";
print $node->{content};
for my $name (qw(first second third fourth fifth)) {
    $node = $node->{$name};
    print "\n", ucfirst($name), "======================\n";
    print $node->{content};
}
exit;
# ParseCore( $aref, $core )
#
# Recursively splits $core into named blocks and loose text, recording both
# in the hash referenced by $aref:
#   * each <!--block:NAME--> ... <!--endblock--> becomes $aref->{NAME}, a
#     fresh hash ref that is filled by a recursive call on the block's
#     inner text;
#   * text outside any block at this level is appended to $aref->{content}.
# When the recursive call reports a nested block name, that name is stored
# in the child's '#next' entry.
#
# Returns the name of the last block matched at this level, or undef when
# $core contained no block.
sub ParseCore
{
    my ($aref, $core) = @_;
    my $last_block;

    # Group 2 = block name, group 3 = block interior, group 4 = loose text
    # (group 1 wraps the whole alternation and is not used here).
    # (?R) re-enters the whole pattern so nested blocks are matched as units.
    while ( $core =~ /(?is)(<!--block:(.*?)-->((?:(?:(?!<!--block:(?:.*?)-->).)|(?R))*?)<!--endblock-->|((?:(?!<!--block:.*?-->).)+))/g )
    {
        # Copy the captures into lexicals before doing anything else.
        my ($name, $inner, $text) = ($2, $3, $4);

        # No block name means the loose-text alternative matched.
        unless (defined $name) {
            $aref->{content} .= $text;
            next;
        }

        $last_block = $name;
        my $child  = $aref->{$name} = {};
        my $nested = ParseCore($child, $inner);
        $child->{'#next'} = $nested if defined $nested;
    }
    return $last_block;
}
#================================================
__DATA__
some html content here top base
<!--block:first-->
<table border="1" style="color:red;">
<tr class="lines">
<td align="left" valign="<--valign-->">
<b>bold</b><a href="http://www.mewsoft.com">mewsoft</a>
<!--hello--> <--again--><!--world-->
some html content here 1 top
<!--block:second-->
some html content here 2 top
<!--block:third-->
some html content here 3 top
<!--block:fourth-->
some html content here 4 top
<!--block:fifth-->
some html content here 5a
some html content here 5b
<!--endblock-->
<!--endblock-->
some html content here 3a
some html content here 3b
<!--endblock-->
some html content here 2 bottom
<!--endblock-->
some html content here 1 bottom
<!--endblock-->
some html content here1-5 bottom base
some html content here 6-8 top base
<!--block:six-->
some html content here 6 top
<!--block:seven-->
some html content here 7 top
<!--block:eight-->
some html content here 8a
some html content here 8b
<!--endblock-->
some html content here 7 bottom
<!--endblock-->
some html content here 6 bottom
<!--endblock-->
some html content here 6-8 bottom base
Output >>
Base======================
some html content here top base
some html content here1-5 bottom base
some html content here 6-8 top base
some html content here 6-8 bottom base
First======================
<table border="1" style="color:red;">
<tr class="lines">
<td align="left" valign="<--valign-->">
<b>bold</b><a href="http://www.mewsoft.com">mewsoft</a>
<!--hello--> <--again--><!--world-->
some html content here 1 top
some html content here 1 bottom
Second======================
some html content here 2 top
some html content here 2 bottom
Third======================
some html content here 3 top
some html content here 3a
some html content here 3b
Fourth======================
some html content here 4 top
Fifth======================
some html content here 5a
some html content here 5b
You can use regex recursion to match the outer nestings, then parse the inner cores
using a simple recursive function call.
It's then also possible to parse the content on the nesting level that you are on.
It's also possible to create a nested structure along the way, so that you can later
do the template substitutions.
You can then reconstruct the HTML.
The only tricky part is traversing the array. But if you know how to traverse
arrays (scalars, array/hash refs, and such), it should be no problem.
Here is the sample.
# (?is)<!--block:(.*?)-->((?:(?:(?!<!--(?:.*?)-->).)|(?R))*?)<!--endblock-->|((?:(?!<!--.*?-->).)+)
(?is) # Modifiers: Case insensitive, Dot-all
<!--block: # Begin BLOCK
( .*? ) # (1), block name
-->
( # (2 start), Begin Core
(?:
(?:
(?!
<!--
(?: .*? )
-->
)
.
)
| (?R)
)*?
) # (2 end), End Core
<!--endblock--> # End BLOCK
|
( # (3 start), Or grab content within this core
(?:
(?! <!-- .*? --> )
.
)+
) # (3 end)
Perl test case
use Data::Dumper;
use strict;
use warnings;

# Slurp the whole template from the DATA section.  `local $/` limits slurp
# mode to this do-block; a bare `$/ = undef` would leave it switched on for
# every later read in the program.
my $content = do { local $/; <DATA> };

# 'base' holds the ordered top-level list that ParseCore fills in place.
my %blocks = ( base => [] );
ParseCore( $blocks{base}, $content );
# ParseCore( $aref, $core )
#
# Recursively splits $core into an ordered list that is pushed onto the
# array referenced by $aref:
#   * loose text at this level is pushed as a plain string;
#   * each <!--block:NAME--> ... <!--endblock--> is pushed as a one-key hash
#     { NAME => [ ... ] } whose list is filled by a recursive call on the
#     block's inner text.
# Items are pushed in the order they are matched, so the list mirrors the
# order of the input.
#
# The lookaheads stop only at <!--block:...--> tags, not at every HTML
# comment, so an ordinary comment such as <!--hello--> stays part of the
# surrounding text instead of making the enclosing block fail to match.
#
# Returns nothing; the result is delivered through $aref.
sub ParseCore
{
    my ($aref, $core) = @_;

    # Group 1 = block name, group 2 = block interior, group 3 = loose text.
    # (?R) re-enters the whole pattern so nested blocks are matched as units.
    while ( $core =~ /(?is)<!--block:(.*?)-->((?:(?:(?!<!--block:(?:.*?)-->).)|(?R))*?)<!--endblock-->|((?:(?!<!--block:.*?-->).)+)/g )
    {
        # Copy the captures into lexicals before doing anything else.
        my ($name, $inner, $text) = ($1, $2, $3);

        if ( defined $name )
        {
            my $branch = { $name => [] };
            push @{$aref}, $branch;
            ParseCore( $branch->{$name}, $inner );
        }
        elsif ( defined $text )
        {
            push @{$aref}, $text;
        }
    }
    return;
}
# Dump the nested list/hash structure that ParseCore built under 'base'.
print Dumper(\%blocks);
__DATA__
some html content here top base
<!--block:first-->
some html content here 1 top
<!--block:second-->
some html content here 2 top
<!--block:third-->
some html content here 3a
some html content here 3b
<!--endblock-->
some html content here 2 bottom
<!--endblock-->
some html content here 1 bottom
<!--endblock-->
some html content here bottom base
Output >>
$VAR1 = {
'base' => [
'
some html content here top base
',
{
'first' => [
'
some html content here 1 top
',
{
'second' => [
'
some html content here 2 top
',
{
'third' => [
'
some html content here 3a
some html content here 3b
'
]
},
'
some html content here 2 bottom
'
]
},
'
some html content here 1 bottom
'
]
},
'
some html content here bottom base
'
]
};