Michael Sperber <sperber_at_informatik.uni-tuebingen.de> writes:
> c) As the proposed design implies buffering at some level, and the
> transfer of binary data to arbitrary bytes objects, it is at
> least quite difficult to implement zero-copy I/O, facilities for
> which are offered by many modern operating systems.
I think zero-copy I/O, as offered by the OS, supports only binary I/O
of whole memory-page mappings in buffers aligned to whole pages. This
is hard to interface to stream-based textual I/O.
I vaguely remember that someone has once tried to base I/O in Haskell
on mmap() instead of read(), and reported that it was not worth the
effort. Troubles include using a lot of address space, limiting file
size to available address space, unspecified behavior when the file
changes size while it is mapped, and the method not being
applicable to sockets or pipes. Mmap has its uses, but it should be
a conscious decision of the application, and it needs different usage
patterns: treating the file as an array rather than as a stream with a
current file pointer. Implementing stream I/O in terms of mmap is too
problematic.
> The design of the (r6rs ports) library in the current draft, despite
> its numerous flaws, has the nice property that binary and textual I/O
> can be interleaved arbitrarily on the same port.
It's impossible to implement efficiently for transcoders implemented
externally.
I've made a test. Here is an implementation of a fragment of an I/O
stack, roughly based on my design, but with resizable buffers changed
to fixed-size buffers, and translated to C. There are two variants of
the actual stack used (from bottom to top):
1. test_buffer_chars:
* raw file
* block iconv decoder (with a byte buffer)
* buffered char input (with a char buffer)
2. test_buffer_bytes:
* raw_file
* buffered byte input which uses iconv to decode each character
(with a byte buffer)
In both cases individual characters of a 50 MB file are obtained and
ignored.
The first design uses more copying between buffers, but it calls
iconv() once per block instead of once per character, and is 10 times
faster for "ISO-8859-2".
Even when iconv calls are replaced with a simple array lookup
(assuming a 1-1 mapping between bytes and characters), the time in
both designs is the same (the difference is smaller than variations
between runs).
Compiled on Linux with gcc -O2 -fomit-frame-pointer
#include <stdbool.h>
#include <wchar.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <iconv.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
// Raw, undecoded octet as read from a file.
typedef unsigned char byte_t;

// Capacity, in elements, of every fixed-size buffer in this program
// (byte buffers and wchar_t buffers alike).
enum { BUFFER_SIZE = 4096 };

// iconv names of the encoding being decoded from and of the in-memory
// wide-character encoding being decoded to.
#define EXTERNAL_ENCODING "ISO-8859-2"
#define WCHAR_ENCODING "wchar_t"
// Report a fatal error and terminate the process.
//
// Writes "<str> (<text for the current errno>)" plus a newline to stderr
// and exits with status 1. Never returns.
__attribute__((noreturn))
void error(const char *str) {
    const char *reason = strerror(errno);

    fprintf(stderr, "%s (%s)\n", str, reason);
    exit(1);
}
// Function-pointer type for pulling bytes out of a source.
//
// As implemented by read_some_raw_file: on entry *buf_ptr points at free
// space of *size_ptr bytes. The callee stores the bytes it obtained there,
// advances *buf_ptr past them and reduces *size_ptr by the same amount.
// Returns true when the source reported end of input, false otherwise.
typedef bool (*read_some_bytes)(void *self, byte_t **buf_ptr,
size_t *size_ptr);
// Common view of any byte source. Code in this file casts a pointer to a
// concrete source (e.g. struct raw_file) to struct byte_input * and calls
// read_some through it, so concrete sources keep read_some as first member.
struct byte_input {
read_some_bytes read_some;
};
// Function-pointer type for pulling decoded wide characters out of a source.
//
// As implemented by read_some_iconv_decoder: on entry *buf_ptr points at
// free space for *size_ptr wchar_t elements. The callee advances *buf_ptr
// past the characters it produced and leaves the remaining free element
// count in *size_ptr. Returns true when the underlying source reported end
// of input, false otherwise.
typedef bool (*read_some_chars)(void *self, wchar_t **buf_ptr,
size_t *size_ptr);
// Common view of any character source. Code in this file casts a pointer
// to a concrete source (e.g. struct iconv_decoder) to struct char_input *
// and calls read_some through it.
struct char_input {
read_some_chars read_some;
};
//// raw_file
// Byte source backed directly by a file descriptor; set up by open_raw_file.
struct raw_file {
// Set to read_some_raw_file; kept first so the struct can be accessed
// through a struct byte_input pointer.
read_some_bytes read_some;
// Descriptor returned by open(); read from by read_some_raw_file.
int fd;
};
// read_some implementation for struct raw_file.
//
// Issues a single read() of at most *size_ptr bytes into *buf_ptr. When
// bytes arrive, *buf_ptr is moved past them, *size_ptr shrinks by the same
// count and false is returned. A read() result of zero yields true (end of
// file) with both outputs untouched. A failing read() is fatal via error().
static bool
read_some_raw_file(void *self, byte_t **buf_ptr, size_t *size_ptr) {
    const struct raw_file *raw = self;
    const ssize_t got = read(raw->fd, *buf_ptr, *size_ptr);

    if (got < 0) {
        error("read");
    }
    if (got > 0) {
        *buf_ptr += got;
        *size_ptr -= (size_t)got;
        return false;
    }
    return true;
}
void
open_raw_file(struct raw_file *s, const char *name) {
s->read_some = read_some_raw_file;
s->fd = open(name, O_RDONLY);
if (s->fd < 0) error("open");
}
//// iconv_decoder
// Character source that decodes whole blocks of bytes from another byte
// source with iconv. Initialised by init_iconv_decoder.
struct iconv_decoder {
    // Set to read_some_iconv_decoder; kept first so the struct can be
    // accessed through a struct char_input pointer.
    read_some_chars read_some;
    // Underlying byte source the raw input is pulled from.
    struct byte_input *base;
    // Conversion descriptor from EXTERNAL_ENCODING to WCHAR_ENCODING.
    iconv_t cd;
    // Storage for bytes that have been read but not yet decoded.
    byte_t buf[BUFFER_SIZE];
    // The not-yet-decoded bytes occupy [begin, end) inside buf.
    byte_t *begin;
    byte_t *end;
    // Free space after end, as handed to the base source when refilling.
    size_t size_after_end;
    // True after iconv ran out of output space (E2BIG); the next call then
    // decodes the leftover bytes without reading more input first.
    bool overflow;
};
// read_some implementation for struct iconv_decoder.
//
// Unless the previous call stopped because the output was full, the
// undecoded leftover bytes are moved to the start of the byte buffer and
// the base source is asked once for more. Then a single iconv() call
// decodes as much as fits into the caller's wchar_t buffer.
//
// On return *buf_ptr has been advanced past the produced characters and
// *size_ptr holds the number of wchar_t slots still free.
//
// Returns the end-of-input flag reported by the base source, except that
// false is returned when iconv stopped for lack of output space (E2BIG).
// EINVAL (input ends inside a multibyte sequence) is tolerated so the rest
// can be supplied by a later call; any other iconv failure is fatal.
static bool
read_some_iconv_decoder(void *self, wchar_t **buf_ptr, size_t *size_ptr) {
    struct iconv_decoder *dec = self;
    bool source_exhausted = false;

    if (!dec->overflow) {
        // Compact the leftover bytes to the front, then top up the buffer.
        const size_t pending = (size_t)(dec->end - dec->begin);

        memmove(dec->buf, dec->begin, pending);
        dec->begin = dec->buf;
        dec->end = dec->buf + pending;
        dec->size_after_end = BUFFER_SIZE - pending;
        source_exhausted = dec->base->read_some(dec->base, &dec->end,
                                                &dec->size_after_end);
    }

    // iconv counts both input and output in bytes.
    size_t in_left = (size_t)(dec->end - dec->begin);
    size_t out_left = *size_ptr * sizeof(wchar_t);
    const size_t rc = iconv(dec->cd,
                            (char **)&dec->begin, &in_left,
                            (char **)buf_ptr, &out_left);

    dec->begin = dec->end - in_left;
    *size_ptr = out_left / sizeof(wchar_t);

    if (rc == (size_t)-1) {
        if (errno == E2BIG) {
            // Output buffer full: remember to skip the refill next time.
            dec->overflow = true;
            return false;
        }
        if (errno != EINVAL) {
            error("iconv");
        }
    }

    dec->overflow = false;
    return source_exhausted;
}
// Initialise *s as a block decoder reading bytes from base.
//
// base is used as a struct byte_input *. A conversion descriptor from
// EXTERNAL_ENCODING to WCHAR_ENCODING is opened (failure is fatal via
// error()) and the byte buffer starts out empty.
void
init_iconv_decoder(struct iconv_decoder *s, void *base) {
    const iconv_t cd = iconv_open(WCHAR_ENCODING, EXTERNAL_ENCODING);

    if (cd == (iconv_t)-1) {
        error("iconv_open");
    }

    s->read_some = read_some_iconv_decoder;
    s->base = base;
    s->cd = cd;
    s->begin = s->end = s->buf;
    s->size_after_end = BUFFER_SIZE;
    s->overflow = false;
}
//// buffered_char_input
// Buffered reader handing out one decoded character at a time from a
// character source. Initialised by init_buffered_char_input.
struct buffered_char_input {
    // Character source the buffer is refilled from.
    struct char_input *base;
    // Storage for decoded characters not yet handed out.
    wchar_t buf[BUFFER_SIZE];
    // The unread characters occupy [begin, end) inside buf.
    wchar_t *begin;
    wchar_t *end;
    // Free element count after end, as handed to the base source.
    size_t size_after_end;
};
// Return the next character from s, or WEOF once the input is exhausted.
//
// Characters are served from the internal buffer. When it is empty it is
// reset and the base source is called repeatedly until it either delivers
// at least one character or reports end of input without delivering any.
wchar_t
read_char(struct buffered_char_input *s) {
    if (s->begin == s->end) {
        bool exhausted;

        s->begin = s->end = s->buf;
        s->size_after_end = BUFFER_SIZE;
        do {
            exhausted = s->base->read_some(s->base, &s->end,
                                           &s->size_after_end);
        } while (s->begin == s->end && !exhausted);

        if (s->begin == s->end) {
            return WEOF;
        }
    }
    return *s->begin++;
}
// Initialise *s as an empty buffered reader on top of base, which is used
// as a struct char_input *.
void
init_buffered_char_input(struct buffered_char_input *s,
                         void *base) {
    s->base = base;
    s->size_after_end = BUFFER_SIZE;
    s->begin = s->end = s->buf;
}
//// buffered_decoded_input
// Byte-to-character lookup table; main() fills it with the identity
// mapping. It is only referenced by the commented-out fast path in
// read_decoded_char.
static wchar_t trans[256];
// Buffered byte reader that decodes one character per request with iconv.
// Initialised by init_buffered_decoded_input.
struct buffered_decoded_input {
    // Byte source the buffer is refilled from.
    struct byte_input *base;
    // Storage for bytes read but not yet decoded.
    byte_t buf[BUFFER_SIZE];
    // The not-yet-decoded bytes occupy [begin, end) inside buf.
    byte_t *begin;
    byte_t *end;
    // Free space after end, as handed to the base source when refilling.
    size_t size_after_end;
    // Conversion descriptor from EXTERNAL_ENCODING to WCHAR_ENCODING.
    iconv_t cd;
};
// Return the next decoded character from s, or WEOF at end of input.
//
// Buffered bytes are decoded one character at a time: iconv() is called
// with room for exactly one wchar_t of output, so it stops as soon as one
// character has been produced. When the buffer is empty, or ends in the
// middle of a multibyte sequence, the undecoded tail is moved to the front
// of the buffer and the base source is asked for more bytes.
//
// A truncated multibyte sequence at the very end of the input is dropped
// and WEOF is returned. Any iconv failure other than EINVAL is fatal via
// error().
wchar_t
read_decoded_char(struct buffered_decoded_input *s) {
    bool last = false;

    for (;;) {
        if (s->begin != s->end) {
            // Benchmark variant without iconv (1-1 byte/char mapping):
            // return trans[*s->begin++];
            wchar_t ch;
            size_t input_size = (size_t)(s->end - s->begin);
            char *output_buf = (char *)&ch;
            size_t output_size = sizeof(wchar_t);
            size_t status = iconv(s->cd,
                                  (char **)&s->begin, &input_size,
                                  &output_buf, &output_size);

            s->begin = s->end - input_size;

            // Output slot completely filled: one character was decoded
            // (iconv may still report E2BIG because more input remains).
            if (output_size == 0) {
                return ch;
            }

            // EINVAL means the buffer ends inside a multibyte sequence.
            // That must NOT leave the for(;;) loop (doing so would fall off
            // the end of this non-void function); instead fall through to
            // the refill below so the sequence can be completed.
            if (status == (size_t)-1 && errno != EINVAL) {
                error("iconv");
            }
        }

        // The source already reported end of input and nothing decodable
        // is left.
        if (last) {
            return WEOF;
        }

        // Compact the undecoded tail to the front and read more bytes.
        size_t remaining = (size_t)(s->end - s->begin);

        memmove(s->buf, s->begin, remaining);
        s->begin = s->buf;
        s->end = s->buf + remaining;
        s->size_after_end = BUFFER_SIZE - remaining;
        last = s->base->read_some(s->base, &s->end,
                                  &s->size_after_end);
    }
}
// Initialise *s as an empty per-character decoding reader on top of base,
// which is used as a struct byte_input *.
//
// Opens a conversion descriptor from EXTERNAL_ENCODING to WCHAR_ENCODING;
// failure to do so is fatal via error().
void
init_buffered_decoded_input(struct buffered_decoded_input *s,
                            void *base) {
    const iconv_t cd = iconv_open(WCHAR_ENCODING, EXTERNAL_ENCODING);

    if (cd == (iconv_t)-1) {
        error("iconv_open");
    }

    s->base = base;
    s->cd = cd;
    s->begin = s->end = s->buf;
    s->size_after_end = BUFFER_SIZE;
}
//// main
static void
test_buffer_chars(const char *filename) {
struct raw_file raw_file;
open_raw_file(&raw_file, filename);
struct iconv_decoder iconv_decoder;
init_iconv_decoder(&iconv_decoder, &raw_file);
struct buffered_char_input buffered_char_input;
init_buffered_char_input(&buffered_char_input, &iconv_decoder);
for(;;) {
wchar_t ch = read_char(&buffered_char_input);
if (ch == WEOF) break;
/*
if (ch <= 0x7E)
putchar(ch);
else
printf("\\x%X;", (int)ch);
*/
}
}
static void
test_buffer_bytes(const char *filename) {
struct raw_file raw_file;
open_raw_file(&raw_file, filename);
struct buffered_decoded_input buffered_decoded_input;
init_buffered_decoded_input(&buffered_decoded_input, &raw_file);
for(;;) {
wchar_t ch = read_decoded_char(&buffered_decoded_input);
if (ch == WEOF) break;
/*
if (ch <= 0x7E)
putchar(ch);
else
printf("\\x%X;", (int)ch);
*/
}
}
// Entry point: time one pass of the selected benchmark over a file.
//
// Fills the identity lookup table, expects exactly one argument (the file
// name; otherwise prints a usage line to stderr and exits with status 1),
// runs test_buffer_chars on it and prints the elapsed wall-clock time in
// seconds with millisecond precision.
int
main(int argc, char **argv) {
    for (int byte = 0; byte < 256; ++byte) {
        trans[byte] = byte;
    }

    if (argc != 2) {
        fprintf(stderr, "Usage: %s FILENAME\n", argv[0]);
        return 1;
    }

    struct timeval started, finished;

    gettimeofday(&started, NULL);
    test_buffer_chars(argv[1]);
    // test_buffer_bytes(argv[1]);
    gettimeofday(&finished, NULL);

    // Subtract the timestamps field by field, borrowing one second when
    // the microsecond part would go negative.
    time_t secs = finished.tv_sec - started.tv_sec;
    long usecs = (long)finished.tv_usec - (long)started.tv_usec;

    if (usecs < 0) {
        usecs += 1000000;
        --secs;
    }

    printf("Time: %.3fs\n",
           (double)secs + (double)usecs / 1000000.0);
    return 0;
}
--
__("< Marcin Kowalczyk
\__/ qrczak_at_knm.org.pl
^^ http://qrnik.knm.org.pl/~qrczak/
Received on Sat Nov 25 2006 - 13:37:42 UTC