support multi-channel gray slide (23eb895b) · Commits · Jakub Hummel / openslide

src/openslide-decode-jxr.c

+11 −30

Original line number	Diff line number	Diff line
		@@ -78,7 +78,7 @@ static guint get_bits_per_pixel(const PKPixelFormatGUID *pixel_format) {
		return pixel_info.cbitUnit;
		}

		bool _openslide_jxr_decode_buf(const void src, int64_t src_len, uint32_t dst,
		bool _openslide_jxr_decode_buf(const void src, int64_t src_len, uint8_t dst,
		int64_t dst_len, GError **err) {
		struct WMPStream *pStream = NULL;
		PKImageDecode *pDecoder = NULL;
		@@ -86,7 +86,6 @@ bool _openslide_jxr_decode_buf(const void src, int64_t src_len, uint32_t dst,
		ERR jerr;
		PKPixelFormatGUID fmt;
		PKRect rect = {0, 0, 0, 0};
		g_autofree uint8_t *unjxr = NULL;

		CreateWS_Memory(&pStream, (void *) src, src_len);
		// IID_PKImageWmpDecode is the only supported decoder PKIID
		@@ -102,49 +101,33 @@ bool _openslide_jxr_decode_buf(const void src, int64_t src_len, uint32_t dst,
		}

		pDecoder->GetSize(pDecoder, &rect.Width, &rect.Height);
		int64_t out_len = rect.Width * rect.Height * 4;
		// JXR tile size may be incorrect in czi directory entries
		g_assert(out_len <= dst_len);

		pDecoder->GetPixelFormat(pDecoder, &fmt);
		PKPixelFormatGUID fmt_out;
		void (convert)(uint8_t , size_t, uint32_t *);
		if (IsEqualGUID(&fmt, &GUID_PKPixelFormat24bppBGR)) {
		fmt_out = GUID_PKPixelFormat24bppBGR;
		convert = _openslide_bgr24_to_argb32;
		} else if (IsEqualGUID(&fmt, &GUID_PKPixelFormat48bppRGB)) {
		/* Although the format is called 48bppRGB in JXR, its color order is BGR
		* in czi. Keep 48bppRGB as is and use the openslide function to convert
		* it to argb32.
		*/
		fmt_out = GUID_PKPixelFormat48bppRGB;
		convert = _openslide_bgr48_to_argb32;
		} else if (IsEqualGUID(&fmt, &GUID_PKPixelFormat8bppGray)) {
		g_set_error(err, OPENSLIDE_ERROR, OPENSLIDE_ERROR_FAILED,
		"GUID_PKPixelFormat8bppGray is not supported");
		goto Cleanup;
		fmt_out = GUID_PKPixelFormat8bppGray;
		} else if (IsEqualGUID(&fmt, &GUID_PKPixelFormat16bppGray)) {
		g_set_error(err, OPENSLIDE_ERROR, OPENSLIDE_ERROR_FAILED,
		"GUID_PKPixelFormat16bppGray is not supported");
		goto Cleanup;
		fmt_out = GUID_PKPixelFormat16bppGray;
		} else {
		g_set_error(err, OPENSLIDE_ERROR, OPENSLIDE_ERROR_FAILED,
		"Currently only support GUID_PKPixelFormat24bppBGR and "
		"GUID_PKPixelFormat48bppRGB");
		"Currently only support "
		"GUID_PKPixelFormat24bppBGR, GUID_PKPixelFormat48bppRGB, "
		"GUID_PKPixelFormat8bppGray and GUID_PKPixelFormat16bppGray");
		goto Cleanup;
		}

		uint32_t stride =
		rect.Width *
		uint32_t stride = rect.Width *
		((MAX(get_bits_per_pixel(&fmt), get_bits_per_pixel(&fmt_out)) + 7) / 8);
		int64_t unjxr_len = stride * rect.Height;
		unjxr = g_try_malloc(unjxr_len);
		if (!unjxr) {
		g_set_error(err, OPENSLIDE_ERROR, OPENSLIDE_ERROR_FAILED,
		"Couldn't allocate %" PRId64 " bytes for decoding JXR",
		unjxr_len);
		return false;
		}
		int64_t unjxr_len = rect.Height * stride;
		// JXR tile size may be incorrect in czi directory entries
		g_assert(unjxr_len <= dst_len);

		// Create color converter
		jerr = PKCodecFactory_CreateFormatConverter(&pConverter);
		@@ -157,13 +140,11 @@ bool _openslide_jxr_decode_buf(const void src, int64_t src_len, uint32_t dst,
		goto Cleanup;
		}

		jerr = pConverter->Copy(pConverter, &rect, unjxr, stride);
		jerr = pConverter->Copy(pConverter, &rect, dst, stride);
		if (jerr < 0) {
		goto Cleanup;
		}

		convert(unjxr, unjxr_len, dst);

		Cleanup:
		print_err(jerr, err);
		CloseWS_Memory(&pStream);

src/openslide-decode-jxr.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -25,7 +25,7 @@
		#include <stdint.h>


		bool _openslide_jxr_decode_buf(const void src, int64_t src_len, uint32_t dst,
		bool _openslide_jxr_decode_buf(const void src, int64_t src_len, uint8_t dst,
		int64_t dst_len, GError **err);

		bool _openslide_jxr_dim(const void data, size_t data_len, uint32_t width,

src/openslide-image-avx2.c

+47 −0

Original line number	Diff line number	Diff line
		@@ -80,4 +80,51 @@ void _openslide_restore_czi_zstd1_avx2(uint8_t *src, size_t src_len,
		}
		}

		/*
		 * Convert 16-bit gray samples to 8-bit gray using AVX2.
		 *
		 * src             - packed 16-bit samples; the low byte of each sample is
		 *                   taken from the even byte offset
		 * src_len         - length of src in bytes
		 * pixel_real_bits - number of significant bits per sample; every sample
		 *                   is shifted right by (pixel_real_bits - 8)
		 * dst             - receives one byte per sample (src_len / 2 bytes)
		 *
		 * Samples that still exceed 8 bits after the shift are saturated to 0xFF.
		 */
		void _openslide_gray16_to_gray8_avx2(uint8_t *src, size_t src_len,
		                                     int pixel_real_bits, uint8_t *dst) {
		  /* sixteen 16-bit pixels at a time */
		  int nshift = pixel_real_bits - 8;
		  const int mm_step = 32;
		  /* Every iteration stores 32 bytes but advances dst by only 16, so the
		   * vector loop stops one step early to keep the last store inside dst.
		   * Guard the decrement: for src_len < mm_step the unsigned subtraction
		   * would wrap around and the loop would run far past both buffers.
		   */
		  size_t mm_len = src_len / mm_step;
		  if (mm_len > 0) {
		    mm_len--;
		  }
		  __m256i gray8, tmp1, tmp2;
		  __m256i hi8 = _mm256_setr_epi8(
		      1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1,
		      1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1);
		  __m256i lo8 = _mm256_setr_epi8(
		      0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1,
		      0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1);
		  __m256i allzero = _mm256_set_epi64x(0, 0, 0, 0);
		  for (size_t n = 0; n < mm_len; n++) {
		    tmp1 = _mm256_lddqu_si256((__m256i const *)src); // gray16
		    tmp2 = _mm256_srli_epi16(tmp1, nshift); // right shift
		    gray8 = _mm256_shuffle_epi8(tmp2, lo8); // bits 0-7 of gray16

		    /* Check after the right shift whether the high 8 bits are non-zero.
		     * Sometimes 14 bits zeiss gray uses more than 14 bits.
		     */
		    tmp1 = _mm256_shuffle_epi8(tmp2, hi8); // bits 8-15 of gray16
		    /* 0xFF if the high 8 bits are non-zero, 0 otherwise. After a right
		     * shift of at least one bit the sign bit of the high byte is zero,
		     * therefore it is safe to do a signed compare with 0.
		     */
		    tmp2 = _mm256_cmpgt_epi8(tmp1, allzero);
		    tmp1 = _mm256_or_si256(tmp2, gray8);
		    /* gather quadwords 0 and 2 (the 8 result bytes of each 128-bit lane)
		     * into the low 128 bits */
		    tmp2 = _mm256_permute4x64_epi64(tmp1, 0x08);
		    _mm256_storeu_si256((__m256i *)dst, tmp2);

		    src += mm_step;
		    dst += 16;
		  }

		  /* scalar tail: everything the vector loop did not consume */
		  size_t i = mm_len * mm_step;
		  while (i < src_len) {
		    *dst++ = gray16togray8(src, nshift);
		    i += 2;
		    src += 2;
		  }
		}

		#endif

src/openslide-image-ssse3.c

+44 −0

Original line number	Diff line number	Diff line
		@@ -76,4 +76,48 @@ void _openslide_restore_czi_zstd1_sse3(uint8_t *src, size_t src_len,
		}
		}

		/*
		 * Convert 16-bit gray samples to 8-bit gray, eight samples per step,
		 * using 128-bit SIMD (the byte shuffle used here is an SSSE3 instruction).
		 *
		 * src             - packed 16-bit samples; the low byte of each sample is
		 *                   taken from the even byte offset
		 * src_len         - length of src in bytes
		 * pixel_real_bits - number of significant bits per sample; every sample
		 *                   is shifted right by (pixel_real_bits - 8)
		 * dst             - receives one byte per sample (src_len / 2 bytes)
		 *
		 * Samples that still exceed 8 bits after the shift are saturated to 0xFF.
		 */
		void _openslide_gray16_to_gray8_sse2(uint8_t *src, size_t src_len,
		                                     int pixel_real_bits, uint8_t *dst) {
		  /* eight 16-bit pixels at a time */
		  int nshift = pixel_real_bits - 8;
		  const int mm_step = 16;
		  /* Every iteration stores 16 bytes but advances dst by only 8, so the
		   * vector loop stops one step early to keep the last store inside dst.
		   * Guard the decrement: for src_len < mm_step the unsigned subtraction
		   * would wrap around and the loop would run far past both buffers.
		   */
		  size_t mm_len = src_len / mm_step;
		  if (mm_len > 0) {
		    mm_len--;
		  }
		  __m128i gray8, gray16, tmp1, tmp2;
		  __m128i hi8 =
		      _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1);
		  __m128i lo8 =
		      _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1);
		  __m128i allzero = _mm_set_epi64x(0, 0);

		  for (size_t n = 0; n < mm_len; n++) {
		    /* src carries no alignment guarantee, so use the unaligned load */
		    gray16 = _mm_loadu_si128((__m128i const *)src);
		    tmp2 = _mm_srli_epi16(gray16, nshift);
		    gray8 = _mm_shuffle_epi8(tmp2, lo8);
		    /* Check after the right shift whether the high 8 bits are non-zero.
		     * Sometimes 14 bits zeiss gray uses more than 14 bits.
		     */
		    tmp1 = _mm_shuffle_epi8(tmp2, hi8);
		    /* 0xFF if the high 8 bits are non-zero, 0 otherwise. After a right
		     * shift of at least one bit the sign bit of the high byte is zero,
		     * therefore it is safe to do a signed compare with 0.
		     */
		    tmp2 = _mm_cmpgt_epi8(tmp1, allzero);
		    tmp1 = _mm_or_si128(tmp2, gray8);
		    /* only the low 8 bytes are results; the upper 8 are zero and get
		     * overwritten by the next store or by the scalar tail */
		    _mm_storeu_si128((__m128i *)dst, tmp1);

		    src += mm_step;
		    dst += 8;
		  }

		  /* scalar tail: everything the vector loop did not consume */
		  size_t i = mm_len * mm_step;
		  while (i < src_len) {
		    *dst++ = gray16togray8(src, nshift);
		    i += 2;
		    src += 2;
		  }
		}

		#endif

src/openslide-image.c

+90 −0

Original line number	Diff line number	Diff line
		#include <config.h>
		#include <glib.h>
		#include "openslide-image.h"

		static void bgr24_to_argb32_dispatch(uint8_t *src, size_t src_len,
		@@ -14,6 +15,11 @@ static void restore_czi_zstd1_dispatch(uint8_t *src, size_t src_len,
		static void restore_czi_zstd1_generic(uint8_t *src, size_t src_len,
		uint8_t *dst);

		static void gray16_to_gray8_dispatch(uint8_t *src, size_t src_len,
		int pixel_real_bits, uint8_t *dst);
		static void gray16_to_gray8_generic(uint8_t *src, size_t src_len,
		int pixel_real_bits, uint8_t *dst);

		#ifdef USE_NEON
		static void bgr24_to_argb32_neon(uint8_t src, size_t src_len, uint32_t dst);
		static void restore_czi_zstd1_neon(uint8_t src, size_t src_len, uint8_t dst);
		@@ -26,6 +32,9 @@ _openslide_bgr_convert_t _openslide_bgr48_to_argb32 = &bgr48_to_argb32_dispatch;
		_openslide_restore_czi_zstd1_t _openslide_restore_czi_zstd1 =
		&restore_czi_zstd1_dispatch;

		_openslide_gray16_to_gray8_t _openslide_gray16_to_gray8 =
		&gray16_to_gray8_dispatch;

		static void bgr24_to_argb32_generic(uint8_t *src, size_t src_len,
		uint32_t *dst) {
		// one 24-bit pixel at a time
		@@ -116,6 +125,87 @@ static void restore_czi_zstd1_dispatch(uint8_t *src, size_t src_len,
		return restore_czi_zstd1_generic(src, src_len, dst);
		}

		/*
		 * Reduce one 16-bit gray sample to 8 bits.
		 *
		 * p  - points at the two bytes of the sample (native byte order); no
		 *      alignment is required
		 * ns - number of bits to shift the sample right
		 *
		 * Returns the shifted value, saturated to 255 when it does not fit in
		 * 8 bits.
		 */
		uint8_t gray16togray8(uint8_t *p, int ns) {
		  /* Load the sample with memcpy: this reads the pixel data itself (not
		   * the pointer value) and is valid for unaligned addresses.
		   */
		  uint16_t sample;
		  memcpy(&sample, p, sizeof(sample));
		  uint16_t v = sample >> ns;

		  /* 14 bits gray image in zeiss Axioscan7 sometimes uses more than 14 bits,
		   * these pixels appear black if treated as 14 bits */
		  // sadly, conditional makes convert at least 15% slower
		  return (v > 255) ? 255 : (uint8_t) v;
		}

		/*
		 * Portable fallback: convert 16-bit gray samples to 8-bit, one sample
		 * at a time. Every two bytes of src produce one byte in dst; each sample
		 * is shifted right by (pixel_real_bits - 8) via gray16togray8().
		 */
		static void gray16_to_gray8_generic(uint8_t *src, size_t src_len,
		                                    int pixel_real_bits, uint8_t *dst) {
		  const int shift = pixel_real_bits - 8;

		  for (size_t off = 0; off < src_len; off += 2) {
		    *dst++ = gray16togray8(src + off, shift);
		  }
		}

		/*
		 * Pad the rows of a tightly packed image so that every row occupies the
		 * stride given by CAIRO_STRIDE_FOR_WIDTH_BPP (4 byte alignment).
		 *
		 * src         - packed input, h rows of w * pixel_bytes bytes
		 * src_len     - must equal h * w * pixel_bytes (asserted)
		 * dst         - output buffer, h rows of `stride` bytes
		 * dst_len     - must equal h * stride (asserted)
		 * pixel_bytes - bytes per pixel
		 * w, h        - image width and height in pixels
		 *
		 * Only the pixel bytes of each row are written; the padding bytes at the
		 * end of each dst row are left untouched.
		 */
		void _openslide_add_row_padding(uint8_t *src, size_t src_len, uint8_t *dst,
		                                size_t dst_len, int pixel_bytes, int32_t w,
		                                int32_t h) {
		  int32_t stride = CAIRO_STRIDE_FOR_WIDTH_BPP(w, pixel_bytes * 8);
		  int32_t w_bytes = w * pixel_bytes;

		  g_assert((size_t) h * w_bytes == src_len);
		  g_assert((size_t) h * stride == dst_len);

		  for (int32_t row = 0; row < h; row++) {
		    memcpy(dst, src, w_bytes);
		    src += w_bytes;
		    dst += stride;
		  }
		}

		/*
		 * Remove the 4 byte alignment padding from image rows, producing a
		 * tightly packed image.
		 *
		 * src         - padded input, h rows of `stride` bytes, where stride is
		 *               given by CAIRO_STRIDE_FOR_WIDTH_BPP
		 * src_len     - must equal h * stride (asserted)
		 * dst         - output buffer, h rows of w * pixel_bytes bytes
		 * dst_len     - must equal h * w * pixel_bytes (asserted)
		 * pixel_bytes - bytes per pixel
		 * w, h        - image width and height in pixels
		 */
		void _openslide_del_row_padding(uint8_t *src, size_t src_len, uint8_t *dst,
		                                size_t dst_len, int pixel_bytes, int32_t w,
		                                int32_t h) {
		  int32_t stride = CAIRO_STRIDE_FOR_WIDTH_BPP(w, pixel_bytes * 8);
		  int32_t w_bytes = w * pixel_bytes;

		  g_assert((size_t) h * stride == src_len);
		  g_assert((size_t) h * w_bytes == dst_len);

		  for (int32_t row = 0; row < h; row++) {
		    memcpy(dst, src, w_bytes);
		    src += stride;
		    dst += w_bytes;
		  }
		}

		/*
		 * Initial target of the _openslide_gray16_to_gray8 function pointer.
		 * On the first call it picks the best implementation available for the
		 * running CPU, stores it in _openslide_gray16_to_gray8 so that later
		 * calls go straight to it, and forwards the current call.
		 *
		 * non-SIMD: 1.91 GB/s
		 * SSE2: 3.70 GB/s, 1.94x
		 * AVX2: 4.01 GB/s, 2.10x
		 */
		static void gray16_to_gray8_dispatch(uint8_t *src, size_t src_len,
		                                     int pixel_real_bits, uint8_t *dst) {
		#ifdef USE_AVX2
		  if (__builtin_cpu_supports("avx2")) {
		    _openslide_gray16_to_gray8 = &_openslide_gray16_to_gray8_avx2;
		    _openslide_gray16_to_gray8(src, src_len, pixel_real_bits, dst);
		    return;
		  }
		#endif
		#ifdef USE_SSSE3
		  /* The 128-bit routine uses _mm_shuffle_epi8, which is an SSSE3
		   * instruction; SSE3 support alone is not sufficient to run it.
		   */
		  if (__builtin_cpu_supports("ssse3")) {
		    _openslide_gray16_to_gray8 = &_openslide_gray16_to_gray8_sse2;
		    _openslide_gray16_to_gray8(src, src_len, pixel_real_bits, dst);
		    return;
		  }
		#endif

		  _openslide_gray16_to_gray8 = &gray16_to_gray8_generic;
		  _openslide_gray16_to_gray8(src, src_len, pixel_real_bits, dst);
		  return;
		}

		#ifdef USE_NEON
		#include <arm_neon.h>