<?php
// www.jackreichert.com/2012/11/09/how-to-convert-docx-to-html/
// some fixes by bron
// location of xml file (save docx as xml file)
$myFile = 'test.xml';
$myDir = dirname(__FILE__).'/word/';
$xmlFile = $myDir.$myFile;

/**
 * Escape a run's text for inclusion in the generated HTML.
 *
 * The HTML special characters are escaped first; afterwards every non-ASCII
 * character is replaced by a decimal numeric character reference. The markup
 * therefore stays pure ASCII (as it was with the former ASCII transliteration)
 * but no character is lost or approximated.
 *
 * @param string $value Raw UTF-8 text taken from the document.
 * @return string ASCII-only, HTML-safe text.
 */
function docx_escape_text($value)
{
    // ENT_SUBSTITUTE guarantees valid UTF-8 output, which the /u pattern below requires.
    $escaped = htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    $encoded = preg_replace_callback('/[^\x00-\x7F]/u', function ($match) {
        // UCS-4BE is the code point as a 32-bit big-endian integer.
        $codePoint = unpack('N', iconv('UTF-8', 'UCS-4BE', $match[0]));
        return '&#'.$codePoint[1].';';
    }, $escaped);

    // preg_replace_callback() yields null on a PCRE error; keep the escaped text in that case.
    return ($encoded === null) ? $escaped : $encoded;
}

$reader = new XMLReader;
if (!$reader->open($xmlFile)) {
    // Fail here with a clear message instead of erroring later on an unloaded reader.
    throw new RuntimeException('Unable to open '.$xmlFile);
}

// accumulated html fragment for the whole document
$text = '';

// loop through docx xml dom
while ($reader->read()) {
    // look for new paragraphs
    if ($reader->nodeType !== XMLReader::ELEMENT || $reader->name !== 'w:p') {
        continue;
    }

    // new instance of XMLReader for parsing the paragraph independently
    $p = $reader->readOuterXML();
    $paragraph = new XMLReader;
    if ($p === '' || !$paragraph->xml($p)) {
        continue;
    }

    // search for heading; "Heading" is the style id prefix of Word's English
    // built-in headings, "Header" and "Kop" are kept for existing documents
    $headerLevel = preg_match('/w:pStyle w:val="(Header|Heading|Kop)([1-6])"/', $p, $matches)
        ? (int) $matches[2]
        : 0;

    // open <h> or <p> tag
    $blockTag = ($headerLevel > 0) ? 'h'.$headerLevel : 'p';
    $text .= '<'.$blockTag.'>';

    // stack of inline tags currently open inside this paragraph (outermost first)
    $openTags = [];

    // loop through paragraph dom
    while ($paragraph->read()) {
        // look for runs
        if ($paragraph->nodeType !== XMLReader::ELEMENT || $paragraph->name !== 'w:r') {
            continue;
        }
        $node = trim($paragraph->readInnerXML());

        // add <br /> tags
        if (strpos($node, '<w:br ') !== false) {
            $text .= '<br />';
        }

        // inline tags this run needs, always in the same nesting order
        $wanted = [];
        if (strpos($node, '<w:b/>') !== false) {
            $wanted[] = 'strong';
        }
        if (strpos($node, '<w:i/>') !== false) {
            $wanted[] = 'em';
        }
        // an underline element whose value is "none" explicitly switches underlining off
        if (strpos($node, '<w:u ') !== false && !preg_match('/<w:u\s[^>]*w:val="none"/', $node)) {
            $wanted[] = 'u';
        }

        // Keep the outer tags both runs share, close the rest innermost-first,
        // then open what is missing. Closing happens BEFORE this run's text is
        // written, so an unformatted run no longer inherits the previous run's tags.
        $keep = 0;
        $limit = min(count($openTags), count($wanted));
        while ($keep < $limit && $openTags[$keep] === $wanted[$keep]) {
            $keep++;
        }
        while (count($openTags) > $keep) {
            $text .= '</'.array_pop($openTags).'>';
        }
        for ($i = $keep, $total = count($wanted); $i < $total; $i++) {
            $text .= '<'.$wanted[$i].'>';
            $openTags[] = $wanted[$i];
        }

        // build text string of doc
        $runNode = $paragraph->expand();
        $text .= docx_escape_text(($runNode === false) ? '' : $runNode->textContent);
    }

    // formatting never spans paragraphs: close whatever is still open
    while (count($openTags) > 0) {
        $text .= '</'.array_pop($openTags).'>';
    }
    $paragraph->close();

    $text .= '</'.$blockTag.'>';
}
$reader->close();

// fix invalid html; nothing to do (and loadHTML rejects the input) when no paragraph was found
if ($text !== '') {
    // collect parser diagnostics internally instead of silencing everything with @
    $previousErrorMode = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8';
    $doc->loadHTML($text);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $html = simplexml_import_dom($doc);
    if ($html instanceof SimpleXMLElement) {
        $goodHTML = $html->asXML();
        // drop the empty paragraphs left by runs without text
        $goodHTML = preg_replace('/<p\/>/','',$goodHTML);
        echo $goodHTML;
    }
}
?>