#!/usr/bin/env perl
# sxw2txt -- Converts OpenOffice.org Writer files to plain text.
# Copyright (C) 2004 Liam Morland
# Copyright (C) 2006 Vincent Lefevre
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, see <https://www.gnu.org/licenses/>.
#
# Liam Morland
# 86A McDougall Road, Waterloo, Ontario, N2L 5C5, CANADA
#
# Changes by Vincent Lefevre:
# 2006-03-21: Better #! line, typos, error messages output to stderr,
# better whitespace removal, clean execution of unzip (for
# filenames with special characters).
# 2006-03-22: Use Archive::Zip, otherwise fallback to the unzip command.
# 2006-03-26: Conversions from Symbol aka "Standard Symbols L" characters
# (in the private Unicode area U+F020..U+F0FF) into standard
# Unicode characters. I don't know whether this private area
# is used in a consistent way by OpenOffice, but IMHO, this
# conversion is better than nothing.
# 2010/2024: Updated copyright notice.
use strict;
use warnings;
# Program name used as a prefix in every diagnostic: the second field of
# the version-control Id keyword string.  The pattern begins with "."
# instead of a literal dollar sign (presumably so that the pattern is not
# itself mistaken for a keyword -- TODO confirm).
my ($proc) = '$Id: sxw2txt 171413 2024-09-01 12:25:23Z vinc17/qaa $'
  =~ /^.Id: (\S+) / or die;

# First argument is taken to be the input file. All other args are ignored.
my $input_file = shift;

# Without a filename, print usage information and exit.  Setting $! to 1
# first makes die terminate the script with exit status 1.
defined $input_file or $! = 1, die <<EOF;
Usage: $proc input_file
Convert an OpenOffice.org Writer file to plain text on standard output.
EOF

# Try to load Archive::Zip at run time.  If the module is not installed
# the eval yields undef and the script falls back to the external unzip
# command when extracting content.xml.
my $zip = eval {
    require Archive::Zip;
    Archive::Zip->new();
};
# Load the raw bytes of content.xml from the input file into $_, using
# Archive::Zip when it could be loaded and the unzip command otherwise.
if (defined $zip)
{
    # A true return value from read() is treated as a failure
    # (Archive::Zip status codes use 0 for success).
    $zip->read($input_file)
        and die "$proc: $input_file is unreadable or has an incorrect format\n";
    $_ = $zip->contents('content.xml')
        or die "$proc: can't extract content.xml from $input_file\n";
}
else
{
    # Fork with the child's stdout connected to UNZIP in the parent.
    my $pid = open UNZIP, "-|";
    defined $pid or die "$proc: can't fork: $!\n";
    unless ($pid)  # child
    {
        # It's probably better to get the possible error message from unzip,
        # to know the reason of the failure (e.g. "Permission denied").
        #open STDERR, '/dev/null';
        # List form of exec: no shell is involved, so filenames with
        # special characters are passed through unchanged.
        exec 'unzip', '-p', $input_file, 'content.xml';
        die "$proc: exec unzip failed: $!\n";
    }
    # Slurp everything the child writes: with $/ undefined, a single
    # readline returns the whole stream.
    $_ = do { local $/; <UNZIP> };
    # close waits for the child and fails if it exited with a non-zero status.
    close UNZIP or die "$proc: can't extract content.xml from $input_file\n";
}
# Convert the OOo XML (held in $_) to text with a series of regex
# substitutions.  Newlines in the XML source carry no meaning, so they are
# first folded into single spaces; every newline present afterwards has
# been inserted deliberately by the rules below.
s,\n+, ,g;
# Tables are wrapped with [begin-table] and [end-table].
# Rows and cells begin with [table row] and [table cell] respectively.
s,<table:table( [^>]*)?>,\n\n[begin-table],g;
s,</table:table>,\n[end-table],g;
# The cell rule also swallows any tags between the cell's opening tag and
# its first paragraph, as well as that paragraph's opening tag, so that
# the cell text follows the marker directly.
s,<table:table-cell( [^>]*)?>(<[^>]+>)*<text:p[^>]*>,\n[table cell],g;
s,<table:table-row( [^>]*)?>,\n\n[table row],g;
# OOo tabs are made into tab characters.
s,<text:tab-stop/>,\t,g;
# Each list item is given a '*' as a bullet.
# Sorry, no fancy support for nested lists yet.
s,<text:list-item[^>]*>,\n\n* ,g;
# Skip two lines before each new paragraph.
s,<text:p[^>]*>,\n\n,g;
# Get rid of any remaining tags. Want to add support for tags not
# handled above? Do it above this line.
s,<[^>]*>,,g;
# Convert common entities into the appropriate character.  The ampersand
# is decoded last so that text such as "&amp;lt;" ends up as the literal
# "&lt;" instead of being decoded twice.
s,&lt;,<,g;
s,&gt;,>,g;
s,&apos;,',g;
s,&quot;,",g;
s,&amp;,&,g;
# Convert "Standard Symbols L" characters into standard Unicode
# characters. Note: I could have written
# binmode UNZIP, ":utf8";
# just before reading UNZIP to use Unicode code points directly,
# but this makes the substitutions very slow.
#
# The table therefore works on raw UTF-8 bytes.  Each key is the pair of
# bytes that follows the lead byte 0xEF in the three-byte UTF-8 encoding
# of a private-use character (for example U+F021 is EF 80 A1, hence the
# key "\x{80}\x{A1}").  Each value is the replacement text, likewise
# written as UTF-8 bytes (plain ASCII where a single byte suffices).
my %symbol =
(
"\x{80}\x{A1}" => "!",
"\x{80}\x{A2}" => "\x{E2}\x{88}\x{80}",
"\x{80}\x{A3}" => "#",
"\x{80}\x{A4}" => "\x{E2}\x{88}\x{83}",
"\x{80}\x{A5}" => "%",
"\x{80}\x{A6}" => "&",
"\x{80}\x{A7}" => "\x{C9}\x{9C}",
"\x{80}\x{A8}" => "(",
"\x{80}\x{A9}" => ")",
"\x{80}\x{AA}" => "*",
"\x{80}\x{AB}" => "+",
"\x{80}\x{AC}" => ",",
"\x{80}\x{AD}" => "-",
"\x{80}\x{AE}" => ".",
"\x{80}\x{AF}" => "/",
"\x{80}\x{B0}" => "0",
"\x{80}\x{B1}" => "1",
"\x{80}\x{B2}" => "2",
"\x{80}\x{B3}" => "3",
"\x{80}\x{B4}" => "4",
"\x{80}\x{B5}" => "5",
"\x{80}\x{B6}" => "6",
"\x{80}\x{B7}" => "7",
"\x{80}\x{B8}" => "8",
"\x{80}\x{B9}" => "9",
"\x{80}\x{BA}" => ":",
"\x{80}\x{BB}" => ";",
"\x{80}\x{BC}" => "<",
"\x{80}\x{BD}" => "=",
"\x{80}\x{BE}" => ">",
"\x{80}\x{BF}" => "?",
"\x{81}\x{80}" => "\x{E2}\x{89}\x{85}",
"\x{81}\x{81}" => "\x{CE}\x{91}",
"\x{81}\x{82}" => "\x{CE}\x{92}",
"\x{81}\x{83}" => "\x{CE}\x{A7}",
"\x{81}\x{84}" => "\x{CE}\x{94}",
"\x{81}\x{85}" => "\x{CE}\x{95}",
"\x{81}\x{86}" => "\x{CE}\x{A6}",
"\x{81}\x{87}" => "\x{CE}\x{93}",
"\x{81}\x{88}" => "\x{CE}\x{97}",
"\x{81}\x{89}" => "\x{CE}\x{99}",
"\x{81}\x{8A}" => "\x{CF}\x{91}",
"\x{81}\x{8B}" => "\x{CE}\x{9A}",
"\x{81}\x{8C}" => "\x{CE}\x{9B}",
"\x{81}\x{8D}" => "\x{CE}\x{9C}",
"\x{81}\x{8E}" => "\x{CE}\x{9D}",
"\x{81}\x{8F}" => "\x{CE}\x{9F}",
"\x{81}\x{90}" => "\x{CE}\x{A0}",
"\x{81}\x{91}" => "\x{CE}\x{98}",
"\x{81}\x{92}" => "\x{CE}\x{A1}",
"\x{81}\x{93}" => "\x{CE}\x{A3}",
"\x{81}\x{94}" => "\x{CE}\x{A4}",
"\x{81}\x{95}" => "\x{CE}\x{A5}",
"\x{81}\x{96}" => "\x{CF}\x{82}",
"\x{81}\x{97}" => "\x{CE}\x{A9}",
"\x{81}\x{98}" => "\x{CE}\x{9E}",
"\x{81}\x{99}" => "\x{CE}\x{A8}",
"\x{81}\x{9A}" => "\x{CE}\x{96}",
"\x{81}\x{9B}" => "[",
"\x{81}\x{9C}" => "\x{E2}\x{88}\x{B4}",
"\x{81}\x{9D}" => "]",
"\x{81}\x{9E}" => "\x{E2}\x{8A}\x{A5}",
"\x{81}\x{9F}" => "_",
"\x{81}\x{A0}" => "\x{C2}\x{AF}",
"\x{81}\x{A1}" => "\x{CE}\x{B1}",
"\x{81}\x{A2}" => "\x{CE}\x{B2}",
"\x{81}\x{A3}" => "\x{CF}\x{87}",
"\x{81}\x{A4}" => "\x{CE}\x{B4}",
"\x{81}\x{A5}" => "\x{CE}\x{B5}",
"\x{81}\x{A6}" => "\x{CF}\x{95}",
"\x{81}\x{A7}" => "\x{CE}\x{B3}",
"\x{81}\x{A8}" => "\x{CE}\x{B7}",
"\x{81}\x{A9}" => "\x{CE}\x{B9}",
"\x{81}\x{AA}" => "\x{CF}\x{86}",
"\x{81}\x{AB}" => "\x{CE}\x{BA}",
"\x{81}\x{AC}" => "\x{CE}\x{BB}",
"\x{81}\x{AD}" => "\x{CE}\x{BC}",
"\x{81}\x{AE}" => "\x{CE}\x{BD}",
"\x{81}\x{AF}" => "\x{CE}\x{BF}",
"\x{81}\x{B0}" => "\x{CF}\x{80}",
"\x{81}\x{B1}" => "\x{CE}\x{B8}",
"\x{81}\x{B2}" => "\x{CF}\x{81}",
"\x{81}\x{B3}" => "\x{CF}\x{83}",
"\x{81}\x{B4}" => "\x{CF}\x{84}",
"\x{81}\x{B5}" => "\x{CF}\x{85}",
"\x{81}\x{B6}" => "\x{CF}\x{96}",
"\x{81}\x{B7}" => "\x{CF}\x{89}",
"\x{81}\x{B8}" => "\x{CE}\x{BE}",
"\x{81}\x{B9}" => "\x{CF}\x{88}",
"\x{81}\x{BA}" => "\x{CE}\x{B6}",
"\x{81}\x{BB}" => "{",
"\x{81}\x{BC}" => "|",
"\x{81}\x{BD}" => "}",
"\x{81}\x{BE}" => "~",
"\x{82}\x{A1}" => "\x{CF}\x{92}",
"\x{82}\x{A2}" => "\x{E2}\x{80}\x{B2}",
"\x{82}\x{A3}" => "\x{E2}\x{89}\x{A4}",
"\x{82}\x{A4}" => "/",
"\x{82}\x{A5}" => "\x{E2}\x{88}\x{9E}",
"\x{82}\x{A6}" => "f",
"\x{82}\x{A7}" => "\x{E2}\x{99}\x{A3}",
"\x{82}\x{A8}" => "\x{E2}\x{99}\x{A6}",
"\x{82}\x{A9}" => "\x{E2}\x{99}\x{A5}",
"\x{82}\x{AA}" => "\x{E2}\x{99}\x{A0}",
"\x{82}\x{AB}" => "\x{E2}\x{86}\x{94}",
"\x{82}\x{AC}" => "\x{E2}\x{86}\x{90}",
"\x{82}\x{AD}" => "\x{E2}\x{86}\x{91}",
"\x{82}\x{AE}" => "\x{E2}\x{86}\x{92}",
"\x{82}\x{AF}" => "\x{E2}\x{86}\x{93}",
"\x{82}\x{B0}" => "\x{C2}\x{B0}",
"\x{82}\x{B1}" => "\x{C2}\x{B1}",
"\x{82}\x{B2}" => "\x{E2}\x{80}\x{B3}",
"\x{82}\x{B3}" => "\x{E2}\x{89}\x{A5}",
"\x{82}\x{B4}" => "\x{C3}\x{97}",
"\x{82}\x{B5}" => "\x{E2}\x{88}\x{9D}",
"\x{82}\x{B6}" => "\x{E2}\x{88}\x{82}",
"\x{82}\x{B7}" => "\x{E2}\x{80}\x{A2}",
"\x{82}\x{B8}" => "\x{C3}\x{B7}",
"\x{82}\x{B9}" => "\x{E2}\x{89}\x{A0}",
"\x{82}\x{BA}" => "\x{E2}\x{89}\x{A1}",
"\x{82}\x{BB}" => "\x{E2}\x{89}\x{88}",
"\x{82}\x{BC}" => "\x{E2}\x{80}\x{A6}",
"\x{82}\x{BD}" => "|",
"\x{82}\x{BE}" => "\x{E2}\x{80}\x{94}",
"\x{82}\x{BF}" => "\x{E2}\x{86}\x{B5}",
"\x{83}\x{80}" => "\x{E2}\x{84}\x{B5}",
"\x{83}\x{81}" => "\x{E2}\x{84}\x{91}",
"\x{83}\x{82}" => "\x{E2}\x{84}\x{9C}",
"\x{83}\x{83}" => "\x{E2}\x{84}\x{98}",
"\x{83}\x{84}" => "\x{E2}\x{8A}\x{97}",
"\x{83}\x{85}" => "\x{E2}\x{8A}\x{95}",
"\x{83}\x{86}" => "\x{E2}\x{88}\x{85}",
"\x{83}\x{87}" => "\x{E2}\x{88}\x{A9}",
"\x{83}\x{88}" => "\x{E2}\x{88}\x{AA}",
"\x{83}\x{89}" => "\x{E2}\x{8A}\x{83}",
"\x{83}\x{8A}" => "\x{E2}\x{8A}\x{87}",
"\x{83}\x{8B}" => "\x{E2}\x{8A}\x{84}",
"\x{83}\x{8C}" => "\x{E2}\x{8A}\x{82}",
"\x{83}\x{8D}" => "\x{E2}\x{8A}\x{86}",
"\x{83}\x{8E}" => "\x{E2}\x{88}\x{88}",
"\x{83}\x{8F}" => "\x{E2}\x{88}\x{89}",
"\x{83}\x{90}" => "\x{E2}\x{88}\x{A0}",
"\x{83}\x{91}" => "\x{E2}\x{88}\x{87}",
"\x{83}\x{92}" => "\x{C2}\x{AE}",
"\x{83}\x{93}" => "\x{C2}\x{A9}",
"\x{83}\x{94}" => "\x{E2}\x{84}\x{A2}",
"\x{83}\x{95}" => "\x{E2}\x{88}\x{8F}",
"\x{83}\x{96}" => "\x{E2}\x{88}\x{9A}",
"\x{83}\x{97}" => "\x{E2}\x{88}\x{99}",
"\x{83}\x{98}" => "\x{C2}\x{AC}",
"\x{83}\x{99}" => "\x{E2}\x{88}\x{A7}",
"\x{83}\x{9A}" => "\x{E2}\x{88}\x{A8}",
"\x{83}\x{9B}" => "\x{E2}\x{87}\x{94}",
"\x{83}\x{9C}" => "\x{E2}\x{87}\x{90}",
"\x{83}\x{9D}" => "\x{E2}\x{87}\x{91}",
"\x{83}\x{9E}" => "\x{E2}\x{87}\x{92}",
"\x{83}\x{9F}" => "\x{E2}\x{87}\x{93}",
"\x{83}\x{A0}" => "\x{E2}\x{8B}\x{84}",
"\x{83}\x{A1}" => "\x{E3}\x{80}\x{88}",
"\x{83}\x{A2}" => "\x{C2}\x{AE}",
"\x{83}\x{A3}" => "\x{C2}\x{A9}",
"\x{83}\x{A4}" => "\x{E2}\x{84}\x{A2}",
"\x{83}\x{A5}" => "\x{E2}\x{88}\x{91}",
"\x{83}\x{A6}" => "\x{E2}\x{8E}\x{9B}",
"\x{83}\x{A7}" => "\x{E2}\x{8E}\x{9C}",
"\x{83}\x{A8}" => "\x{E2}\x{8E}\x{9D}",
"\x{83}\x{A9}" => "\x{E2}\x{8E}\x{A1}",
"\x{83}\x{AA}" => "\x{E2}\x{8E}\x{A2}",
"\x{83}\x{AB}" => "\x{E2}\x{8E}\x{A3}",
"\x{83}\x{AC}" => "\x{E2}\x{8E}\x{A7}",
"\x{83}\x{AD}" => "\x{E2}\x{8E}\x{A8}",
"\x{83}\x{AE}" => "\x{E2}\x{8E}\x{A9}",
"\x{83}\x{AF}" => "\x{E2}\x{8E}\x{AA}",
"\x{83}\x{B1}" => "\x{E3}\x{80}\x{89}",
"\x{83}\x{B2}" => "\x{E2}\x{88}\x{AB}",
"\x{83}\x{B3}" => "\x{E2}\x{8C}\x{A0}",
"\x{83}\x{B4}" => "\x{E2}\x{8E}\x{AE}",
"\x{83}\x{B5}" => "\x{E2}\x{8C}\x{A1}",
"\x{83}\x{B6}" => "\x{E2}\x{8E}\x{9E}",
"\x{83}\x{B7}" => "\x{E2}\x{8E}\x{9F}",
"\x{83}\x{B8}" => "\x{E2}\x{8E}\x{A0}",
"\x{83}\x{B9}" => "\x{E2}\x{8E}\x{A4}",
"\x{83}\x{BA}" => "\x{E2}\x{8E}\x{A5}",
"\x{83}\x{BB}" => "\x{E2}\x{8E}\x{A6}",
"\x{83}\x{BC}" => "\x{E2}\x{8E}\x{AB}",
"\x{83}\x{BD}" => "\x{E2}\x{8E}\x{AC}",
"\x{83}\x{BE}" => "\x{E2}\x{8E}\x{AD}",
);
# Replace every three-byte sequence starting with 0xEF whose two trailing
# bytes have an entry in %symbol.  Sequences without an entry (any other
# character encoded with lead byte 0xEF, i.e. U+F000..U+FFFF) are put back
# unchanged instead of being replaced by undef, which would silently drop
# the character and trigger an "uninitialized value" warning.
s,\x{EF}(..),exists $symbol{$1} ? $symbol{$1} : "\x{EF}$1",eg;
# Remove extra whitespace and print the result, always ending with \n.
# Blanks at the end of each line are stripped first, so that lines made
# only of blanks become empty before runs of newlines are collapsed;
# otherwise such lines would leave more than one blank line in the output.
s/[^\S\n]*\n/\n/g;
# Allow at most one empty line between blocks of text.
s/\n{3,}/\n\n/g;
# Trim leading and trailing whitespace of the whole text.
s/\A\s+//;
s/\s+\z//;
print "$_\n";