<?php 
 
// should be good enough for names in French bibliographies, although it doesn't handle stop words
// ex. AUSTRALIAN GOVERNMENT DEPARTMENT OF HEALTH AND AGEING => Australian Government Department Of Health And Ageing 
 
/**
 * Tells whether $char is a single unaccented ASCII letter (A-Z or a-z).
 *
 * @param string $char one byte of the text being converted
 * @return bool
 */
function sweeper_is_ascii_letter($char)
{
    return preg_match('/^[A-Za-z]\z/', $char) === 1;
}

/**
 * Decodes one HTML entity to the UTF-8 character it stands for.
 *
 * @param string $entity a complete entity such as "&eacute;" or "&#201;"
 * @return string|null the decoded text, or null when html_entity_decode() does not recognise the entity
 */
function sweeper_decode_entity($entity)
{
    $decoded = html_entity_decode($entity, ENT_QUOTES, 'UTF-8');

    return $decoded === $entity ? null : $decoded;
}

/**
 * Returns the capital-letter counterpart of a lower-case named letter entity.
 *
 * Entity names are case-sensitive and do not all capitalise the same way
 * ("eacute" => "Eacute", "aelig" => "AElig", "eth" => "ETH"), so a few spellings
 * are tried and the first one that decodes to an upper-case letter wins.
 *
 * @param string $lower_entity entity with an all-lower-case name, e.g. "&eacute;"
 * @return string the capitalised entity, or $lower_entity unchanged when none exists
 *                (numeric entities are always returned unchanged)
 */
function sweeper_capitalise_entity($lower_entity)
{
    $name = substr($lower_entity, 1, -1);
    if ($name === '' || $name[0] === '#') {
        return $lower_entity;
    }

    $candidates = array(
        ucfirst($name),
        strtoupper(substr($name, 0, 2)) . substr($name, 2),
        strtoupper($name),
    );
    foreach ($candidates as $candidate) {
        $decoded = sweeper_decode_entity('&' . $candidate . ';');
        if ($decoded !== null && preg_match('/^\p{Lu}\z/u', $decoded) === 1) {
            return '&' . $candidate . ';';
        }
    }

    return $lower_entity;
}

/**
 * Converts text to title case: the first letter of every word is capitalised and
 * the remaining letters are lower-cased. A word that starts with "Mc" also gets its
 * third letter capitalised (MCDONALD => McDonald). Stop words are not treated specially.
 *
 * Well-formed HTML entities are handled as one unit:
 *  - an entity that decodes to a letter counts as a letter of the current word; named
 *    ones are capitalised or lower-cased like any other letter, numeric ones are only
 *    lower-cased textually (their character is not converted);
 *  - any other entity (e.g. &nbsp;, &amp;) ends the current word, exactly like the
 *    literal character would. It is kept as written when valid, otherwise lower-cased.
 * An "&" that does not start a well-formed entity is an ordinary character.
 *
 * Only ASCII letters and entities are recognised as letters; every other byte is
 * copied through untouched and ends the current word.
 *
 * @param string $text the raw content of one span
 * @return string the converted content
 */
function sweeper_title_case($text)
{
    $result = '';
    $length = strlen($text);
    $position = 0;
    $letters_seen = 0;  // letters consumed so far in the current word; 0 means "between words"
    $mc_prefix = false; // true while the current word has so far spelled "M" or "Mc"

    while ($position < $length) {
        // Take either a whole entity or a single byte as the next token.
        $is_entity = $text[$position] === '&'
            && preg_match('/\G&(?:#[0-9]+|#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);/', $text, $entity_match, 0, $position) === 1;
        $token = $is_entity ? $entity_match[0] : $text[$position];
        $position += strlen($token);

        if ($is_entity) {
            $lower_entity = strtolower($token);
            $decoded = sweeper_decode_entity($lower_entity);
            $is_letter = $decoded !== null && preg_match('/^\p{L}\z/u', $decoded) === 1;
        } else {
            $is_letter = sweeper_is_ascii_letter($token);
        }

        if (!$is_letter) {
            if ($is_entity && sweeper_decode_entity($token) === null) {
                // Not valid as written (e.g. "&NBSP;"): normalise the name to lower case.
                $token = strtolower($token);
            }
            $result .= $token;
            $letters_seen = 0;
            $mc_prefix = false;
            continue;
        }

        $letters_seen++;
        // Capitalise the first letter of a word and the letter right after a leading "Mc".
        $capitalise = $letters_seen === 1 || ($letters_seen === 3 && $mc_prefix);
        if ($letters_seen === 1) {
            $mc_prefix = ($token === 'M' || $token === 'm');
        } elseif ($letters_seen === 2) {
            $mc_prefix = $mc_prefix && ($token === 'C' || $token === 'c');
        }

        if ($is_entity) {
            $result .= $capitalise ? sweeper_capitalise_entity($lower_entity) : $lower_entity;
        } else {
            $result .= $capitalise ? strtoupper($token) : strtolower($token);
        }
    }

    return $result;
}

$file = 'C:\wamp\www\sweeper\not-swept\im-014-fr.html';
$contents = is_readable($file) ? file_get_contents($file) : false;

if ($contents === false) {
    // Never fall through to file_put_contents() after a failed read: that would
    // overwrite the document with an empty string.
    print('<p>Unable to read ' . htmlspecialchars($file) . '; nothing was changed.</p>');
} else {
    $span_open = '<span style="text-transform:uppercase">';
    $span_close = '</span>';

    // preg_match_all() returns false on a PCRE error; the cast turns that into "no spans".
    $span_total = (int) preg_match_all('/<span style="text-transform:uppercase">(.*?)<\/span>/is', $contents, $upperclass_span_matches, PREG_OFFSET_CAPTURE);

    print('<table>');
    // Walk the matches from last to first so that the byte offsets of the spans
    // still to be processed stay valid while $contents is being rewritten.
    for ($index = $span_total - 1; $index >= 0; $index--) {
        $span_html = $upperclass_span_matches[0][$index][0];
        $span_offset = $upperclass_span_matches[0][$index][1];
        $span_content = $upperclass_span_matches[1][$index][0];
        $new_span_content = sweeper_title_case($span_content);

        // Report the before/after pair so the conversion can be checked by eye.
        print('<tr>
<th align="left">' . $span_content . '</th>
<td>' . $new_span_content . '</td>
</tr>
');

        $contents = substr_replace($contents, $span_open . $new_span_content . $span_close, $span_offset, strlen($span_html));
    }
    print('</table>');

    if (file_put_contents($file, $contents) === false) {
        print('<p>Unable to write ' . htmlspecialchars($file) . '.</p>');
    }
}
 
?>
 
 |