#!/usr/bin/perl

# docx2txt - convert docx documents to text format
# Sandeep Kumar
# tweaked by Steve Kinzler, kinzler@cs.indiana.edu, Oct 11
# http://docx2txt.sourceforge.net/
# http://www.cs.indiana.edu/~kinzler/home.html#other

use strict;
use warnings;

use File::Spec;
use File::Temp qw(tempfile);

#
# The default settings below can be overridden via docx2txt.config - searched
# first in current directory and then in the same location as this script.
#

our $unzip = 'unzip';		# Windows path like 'C:/path/to/unzip.exe'
our $newLine = "\n";		# Alternative is "\r\n".
our $listIndent = " ";		# Indent nested lists by "\t", " " etc.
our $lineWidth = 75;		# Line width, used for short line justification.
our $showHyperLink = "Y";	# Show hyperlink alongside linked text.


# docx2txt, a command-line utility to convert Docx documents to text format.
# Copyright (C) 2008-2009 Sandeep Kumar
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#
# This script extracts text from document.xml contained inside .docx file.
# Perl v5.8.2 was used for testing this script.
#
# Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
#
# ChangeLog :
#
#    10/08/2008 - Initial version (v0.1)
#    15/08/2008 - Script takes two arguments [second optional] now and can be
#                 used independently to extract text from docx file. It accepts
#                 docx file directly, instead of xml file.
#    18/08/2008 - Added support for center and right justification of text that
#                 fits in a line 80 characters wide (adjustable).
#    03/09/2008 - Fixed the slip in usage message.
#    12/09/2008 - Slightly changed the script invocation and argument handling
#                 to incorporate some of the shell script functionality here.
#                 Added support to handle embedded urls in docx document.
#    23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
#                 Rene Maroufi.
#
#    (Later ChangeLog entries are missing from this copy of the file.)
#


#
# Bullet markers for nested lists, indexed by nesting level.
#
# NOTE(review): the original marker list did not survive in this copy; the
# eight markers below are a reconstruction - confirm against upstream docx2txt.
#

my @levchar = ('*', '+', 'o', '-', '**', '++', 'oo', '--');


#
# Character conversion tables
#

# Only amp, gt and lt are required for docx escapes, others are used for better
# text experience.
my %escChrs = (	amp => '&', gt => '>', lt => '<',
		acute => '\'', brvbar => '|', copy => '(C)', divide => '/',
		laquo => '<<', macr => '-', nbsp => ' ', raquo => '>>',
		reg => '(R)', shy => '-', times => 'x'
);

# UTF-8 byte sequences and their plain ASCII stand-ins.
my %splchars = (
	"\xC2\xA0" => ' ',		# no-break space
	"\xC2\xA6" => '|',		# broken bar
	"\xC2\xA9" => '(C)',		# copyright sign
	"\xC2\xAB" => '<<',		# left-pointing double angle quotation mark
	"\xC2\xAC" => '-',		# not sign
	"\xC2\xAE" => '(R)',		# registered sign
	"\xC2\xB1" => '+-',		# plus-minus sign
	"\xC2\xBB" => '>>',		# right-pointing double angle quotation mark

	# "\xC2\xA7" => '',		# section sign
	# "\xC2\xB6" => '',		# pilcrow sign

	"\xC3\x97" => 'x',		# multiplication sign
	"\xC3\xB7" => '/',		# division sign

	"\xE2\x80\x82" => ' ',		# en space
	"\xE2\x80\x83" => ' ',		# em space
	"\xE2\x80\x85" => ' ',		# four-per-em space
	"\xE2\x80\x93" => ' - ',	# en dash
	"\xE2\x80\x94" => ' -- ',	# em dash
	"\xE2\x80\x98" => '`',		# left single quotation mark
	"\xE2\x80\x99" => '\'',		# right single quotation mark
	"\xE2\x80\x9C" => '"',		# left double quotation mark
	"\xE2\x80\x9D" => '"',		# right double quotation mark
	"\xE2\x80\xA2" => '::',		# bullet
	"\xE2\x80\xA6" => '...',	# horizontal ellipsis

	"\xE2\x84\xA2" => '(TM)',	# trade mark sign

	"\xE2\x89\xA0" => '!=',		# not equal to
	"\xE2\x89\xA4" => '<=',		# less-than or equal to
	"\xE2\x89\xA5" => '>=',		# greater-than or equal to

	#
	# Currency symbols
	#
	"\xC2\xA2" => 'cent',
	"\xC2\xA3" => 'Pound',
	"\xC2\xA5" => 'Yen',
	"\xE2\x82\xAC" => 'Euro'
);


#
# Check argument(s) sanity.
#

my $usage = <<USAGE;

Usage:	$0 <infile.docx|-> [outfile.txt|-]

	Use '-' as the outfile name to dump the text on STDOUT.
	Output is saved in infile.txt if second argument is omitted.

	infile.docx can also be a directory name holding the unzipped content
	of concerned .docx file.

USAGE

die $usage if (@ARGV == 0 || @ARGV > 2) || $ARGV[0] eq '-h';


#
# kinzler 2011-10-24 kludge: an input name of '-' means "read the docx
# document from STDIN".  The stream is saved to a temporary file first, as
# unzip needs a real file to work on.
#

my $tmp;	# Name of the temporary copy of STDIN, if one was made.

# Signal handler: remove the temporary copy and leave.
sub end { unlink $tmp if defined $tmp; exit }

if ($ARGV[0] eq '-') {
    umask 077;
    $SIG{'HUP'} = $SIG{'INT'} = $SIG{'PIPE'} = $SIG{'TERM'} = \&end;

    # File::Temp picks an unpredictable name and creates the file exclusively,
    # unlike a shell redirect to /tmp/docx2txt.<pid>, which follows symlinks.
    # UNLINK also removes the file when the script dies before the explicit
    # unlink further below is reached.
    (my $tmpfh, $tmp) = tempfile('docx2txt.XXXXXX', TMPDIR => 1, UNLINK => 1);

    binmode STDIN;
    binmode $tmpfh;

    my $buf;
    while (read(STDIN, $buf, 65536)) {
        print {$tmpfh} $buf or die "Can't save standard input to <$tmp>!\n";
    }
    close $tmpfh or die "Can't save standard input to <$tmp>!\n";

    $ARGV[0] = $tmp;
}


#
# Check for existence and readability of required file in specified directory,
# and whether it is a text file.
#
# Arguments: file name relative to the directory, directory name.
# Dies with a message if any of the checks fails.
#

sub check_for_required_file_in_folder {
    stat("$_[1]/$_[0]");
    die "Can't read <$_[0]> in <$_[1]>!\n" if ! (-f _ && -r _);
    die "<$_[1]/$_[0]> does not seem to be a text file!\n" if ! -T _;
}

#
# Slurp the file named by the first argument, in binary mode, into the
# caller's variable passed as second argument (assigned through the @_ alias).
# Dies if the file can not be opened.
#

sub readFileInto {
    local $/ = undef;
    open my $fh, '<', $_[0] or die "Couldn't read file <$_[0]>!\n";
    binmode $fh;
    $_[1] = <$fh>;
    close $fh;
}


#
# Check whether first argument is specifying a directory holding extracted
# content of .docx file, or .docx file itself.
#

my $inpIsDir = '';

stat($ARGV[0]);

if (-d _) {
    check_for_required_file_in_folder("word/document.xml", $ARGV[0]);
    check_for_required_file_in_folder("word/_rels/document.xml.rels", $ARGV[0]);
    $inpIsDir = 'y';
}
else {
    die "Can't read docx file <$ARGV[0]>!\n" if ! (-f _ && -r _);
    die "<$ARGV[0]> does not seem to be docx file!\n" if -T _;
}


#
# Get user configuration, if any.
#

#
# Evaluate the configuration file named by the first argument and return the
# list of setting name/value pairs it yields, or an empty list if the file
# can not be evaluated or does not yield pairs.
#
# The name is made absolute first: "do" looks up relative names (other than
# ./ and ../ ones) in @INC, which no longer contains the current directory
# on perl 5.26 and later.
#

sub readConfig {
    my $path = File::Spec->rel2abs($_[0]);

    local $@;
    my @settings = do $path;

    if ($@) {
        warn "Ignoring configuration file <$path>: $@";
        return;
    }

    return @settings % 2 ? () : @settings;
}

my %config;

if (-f "docx2txt.config") {
    %config = readConfig("docx2txt.config");
}
elsif ($0 =~ m%^(.*[/\\])[^/\\]*?$%) {
    my $scriptConfig = "${1}docx2txt.config";
    %config = readConfig($scriptConfig) if -f $scriptConfig;
}

# Only the documented settings can be overridden; going through references
# avoids symbolic variable lookups.
my %settingRef = (
    unzip         => \$unzip,
    newLine       => \$newLine,
    listIndent    => \$listIndent,
    lineWidth     => \$lineWidth,
    showHyperLink => \$showHyperLink
);

foreach my $var (keys %config) {
    ${ $settingRef{$var} } = $config{$var} if $settingRef{$var};
}

#
# Force configuration value to lowercase as expected by script.
#

$showHyperLink = lc $showHyperLink;


#
# Extract xml document content from argument docx file/directory.
#

my $isWindows = defined $ENV{OS} && $ENV{OS} =~ /^Windows/;
my $nulldevice = $isWindows ? "nul" : "/dev/null";

#
# Return the content of a member (second argument) of the zip archive (first
# argument) as produced by "unzip -p", with unzip's error output discarded.
# Returns undef if unzip could not be started.
#

sub unzipMember {
    my ($archive, $member) = @_;
    my $data;

    if ($isWindows) {
        # List-form pipe open is not dependable on older Windows perls, so
        # the quoted shell command is kept there.
        $data = `"$unzip" -p "$archive" $member 2>$nulldevice`;
        return $data;
    }

    # Elsewhere unzip is run without a shell, so that quotes, '$' or
    # backticks in the archive name can not be interpreted as shell syntax.
    # STDERR is pointed at the null device while unzip is started, which is
    # what "2>$nulldevice" did in the shell command.
    my $haveSavedErr = open(my $savedErr, '>&', \*STDERR);
    open(STDERR, '>', $nulldevice) if $haveSavedErr;

    if (open(my $pipe, '-|', $unzip, '-p', $archive, $member)) {
        binmode $pipe;
        local $/ = undef;
        $data = <$pipe>;
        close $pipe;
    }

    open(STDERR, '>&', $savedErr) if $haveSavedErr;

    return $data;
}

my $content;

if ($inpIsDir eq 'y') {
    readFileInto("$ARGV[0]/word/document.xml", $content);
}
else {
    $content = unzipMember($ARGV[0], "word/document.xml");
}

die "Failed to extract required information from <$ARGV[0]>!\n" if ! $content;


#
# Be ready for outputting the extracted text contents.
#

if (@ARGV == 1) {
    $ARGV[1] = $ARGV[0];

    # Remove any trailing slashes to generate proper output filename, when
    # input is directory.
    $ARGV[1] =~ s%[/\\]+$%% if ($inpIsDir eq 'y');

    $ARGV[1] .= ".txt" if !($ARGV[1] =~ s/\.docx$/\.txt/);
}

my $txtfile;

if ($ARGV[1] eq '-') {
    # Two-argument open used to map "> -" to STDOUT implicitly; with the
    # three-argument form this has to be spelled out.
    $txtfile = \*STDOUT;
}
else {
    open($txtfile, '>', $ARGV[1]) || die "Can't create <$ARGV[1]> for output!\n";
}

binmode $txtfile;    # Ensure no auto-conversion of '\n' to '\r\n' on Windows.


#
# Gather information about header, footer, hyperlinks, images, footnotes etc.
#

my $rels;

if ($inpIsDir eq 'y') {
    readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $rels);
}
else {
    $rels = unzipMember($ARGV[0], "word/_rels/document.xml.rels");
}

$rels = '' if ! defined $rels;

# kinzler 2011-10-24 kludge: the temporary copy of STDIN is not needed beyond
# this point.
unlink $tmp if defined $tmp;

#
# Map "<relationship type>:<relationship id>" to the relationship target,
# e.g. "hyperlink:rId5" to the linked URL.  The relationship type is the last
# path component of the Type attribute.
#
# NOTE(review): the original single pattern did not survive in this copy;
# only its three captures (id, type, target) and the key layout are known.
# The attributes are picked individually here, so their order is irrelevant.
#

my %docurels;

while ($rels =~ /<Relationship\s([^>]*?)\/?>/g) {
    my $attrs = $1;

    my ($id)     = $attrs =~ /\bId="([^"]*)"/;
    my ($type)   = $attrs =~ /\bType="[^"]*\/([^\/"]*)"/;
    my ($target) = $attrs =~ /\bTarget="([^"]*)"/;

    next if ! (defined $id && defined $type && defined $target);

    $docurels{"$type:$id"} = $target;
}


#
# Subroutines for center and right justification of text in a line.
#
# Arguments: alignment ("center" or "right"), text.  Text that does not fit
# within $lineWidth, or has any other alignment, is returned unchanged.
#

sub justify {
    my $len = length $_[1];

    if ($_[0] eq "center" && $len < ($lineWidth - 1)) {
        return ' ' x (($lineWidth - $len) / 2) . $_[1];
    }
    elsif ($_[0] eq "right" && $len < $lineWidth) {
        return ' ' x ($lineWidth - $len) . $_[1];
    }
    else {
        return $_[1];
    }
}

#
# Subroutines for dealing with embedded links and images
#
# Arguments: relationship id of the hyperlink, marked up link text.  Returns
# the link text without markup, followed by the link target when showing of
# hyperlinks is enabled and the target differs from the text.
#

sub hyperlink {
    my $hlrid = $_[0];
    my $hltext = $_[1];
    my $hlink = $docurels{"hyperlink:$hlrid"};

    $hltext =~ s/<[^>]*?>//g;

    # A link without a known target is shown as plain text rather than with
    # an empty "[HYPERLINK: ]" suffix.
    $hltext .= " [HYPERLINK: $hlink]"
        if ($showHyperLink eq "y" && defined $hlink && $hltext ne $hlink);

    return $hltext;
}

#
# Subroutines for processing paragraph content.
#
# Argument: inner markup of one paragraph.  Returns its text followed by
# $newLine, justified when the paragraph specifies an alignment.
#

sub processParagraph {
    my $para = $_[0] . "$newLine";

    # List-context match instead of "my ... if ...", whose behaviour is
    # undefined; $align stays undef when no alignment is given.
    my ($align) = $_[0] =~ /<w:jc w:val="([^"]*?)"\/>/;

    $para =~ s/<.*?>//g;
    return justify($align,$para) if $align;

    return $para;
}


#
# Text extraction starts.
#
# NOTE(review): the WordprocessingML element names inside the patterns below
# did not survive in this copy (all markup had been stripped from the file).
# They are reconstructed from the remaining pattern fragments and from the
# surrounding code - verify against the upstream docx2txt release.
#

my %tag2chr = (tab => "\t", noBreakHyphen => "-", softHyphen => " - ");

# Drop the XML declaration line.
$content =~ s/<\?xml .*?\?>(\r)?\n//;

# Remove stuff between TOC related tags.
if ($content =~ m|<w:pStyle w:val="TOCHeading"/>|) {
    $content =~ s|<w:instrText[^>]*>.*?</w:instrText>||g;
}

$content =~ s{<w:(tab|noBreakHyphen|softHyphen)/>}|$tag2chr{$1}|g;

# A paragraph border is rendered as a horizontal rule.
my $hr = '-' x $lineWidth . $newLine;
$content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|g;

# List items get indented by their nesting level and prefixed by the marker
# of that level; levels beyond the marker table fall back to '*'.
$content =~ s{<w:numPr><w:ilvl w:val="([0-9]+)"/>}
             {$listIndent x $1 . ($1 < @levchar ? $levchar[$1] : '*') . ' '}ge;

#
# Uncomment either of below two lines and comment above line, if dealing
# with more than 8 level nested lists.
#

# $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$listIndent x $1 . '* '|ge;
# $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|'*' x ($1+1) . ' '|ge;

# Text marked as "all caps" is upper-cased.
$content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/ge;

$content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperlink($1,$2)/ge;

$content =~ s/<w:p [^>]+?>(.*?)<\/w:p>/processParagraph($1)/ge;

# Empty paragraphs, remaining paragraph ends and line breaks become newlines.
$content =~ s{<w:p [^/>]+?/>|</w:p>|<w:br/>}|$newLine|g;

# Whatever markup is left gets dropped.
$content =~ s/<.*?>//g;


#
# Convert non-ASCII characters/character sequences to ASCII characters.
#

$content =~ s/(\xE2..|\xC2.|\xC3.)/($splchars{$1} ? $splchars{$1} : $1)/ge;

#
# Convert docx specific escape chars first.
#

$content =~ s/(&)(amp|gt|lt)(;)/$escChrs{lc $2}/ig;

#
# Another pass for a better text experience, after sequences like "&amp;laquo;"
# are converted to "&laquo;".
#

$content =~ s/((&)([a-z]+)(;))/($escChrs{lc $3} ? $escChrs{lc $3} : $1)/ige;


#
# Write the extracted and converted text contents to output.
#

print {$txtfile} $content;

# Buffered write errors (e.g. a full disk) only surface on close.
close $txtfile or die "Failed to write output to <$ARGV[1]>!\n";