From ca0212ba19b64488b9e8459a762c11ecd6e7d0bd Mon Sep 17 00:00:00 2001 From: Petr Stodulka Date: Tue, 24 Nov 2015 17:56:11 +0100 Subject: [PATCH] print correctly non-ascii filenames --- extract.c | 289 ++++++++++++++++++++++++++++++++++++++++++++++++-------------- unzpriv.h | 7 ++ 2 files changed, 233 insertions(+), 63 deletions(-) diff --git a/extract.c b/extract.c index 0ee4e93..741b7e0 100644 --- a/extract.c +++ b/extract.c @@ -2648,8 +2648,21 @@ static void set_deferred_symlink(__G__ slnk_entry) } /* end function set_deferred_symlink() */ #endif /* SYMLINKS */ +/* + * If Unicode is supported, assume we have what we need to do this + * check using wide characters, avoiding MBCS issues. + */ - +#ifndef UZ_FNFILTER_REPLACECHAR + /* A convenient choice for the replacement of unprintable char codes is + * the "single char wildcard", as this character is quite unlikely to + * appear in filenames by itself. The following default definition + * sets the replacement char to a question mark as the most common + * "single char wildcard"; this setting should be overridden in the + * appropiate system-specific configuration header when needed. + */ +# define UZ_FNFILTER_REPLACECHAR '?' +#endif /*************************/ /* Function fnfilter() */ /* here instead of in list.c for SFX */ @@ -2661,48 +2674,168 @@ char *fnfilter(raw, space, size) /* convert name to safely printable form */ extent size; { #ifndef NATIVE /* ASCII: filter ANSI escape codes, etc. */ - ZCONST uch *r=(ZCONST uch *)raw; + ZCONST uch *r; // =(ZCONST uch *)raw; uch *s=space; uch *slim=NULL; uch *se=NULL; int have_overflow = FALSE; - if (size > 0) { - slim = space + size -#ifdef _MBCS - - (MB_CUR_MAX - 1) -#endif - - 4; +# if defined( UNICODE_SUPPORT) && defined( _MBCS) +/* If Unicode support is enabled, and we have multi-byte characters, + * then do the isprint() checks by first converting to wide characters + * and checking those. This avoids our having to parse multi-byte + * characters for ourselves. After the wide-char replacements have been + * made, the wide string is converted back to the local character set. + */ + wchar_t *wstring; /* wchar_t version of raw */ + size_t wslen; /* length of wstring */ + wchar_t *wostring; /* wchar_t version of output string */ + size_t woslen; /* length of wostring */ + char *newraw; /* new raw */ + + /* 2012-11-06 SMS. + * Changed to check the value returned by mbstowcs(), and bypass the + * Unicode processing if it fails. This seems to fix a problem + * reported in the SourceForge forum, but it's not clear that we + * should be doing any Unicode processing without some evidence that + * the name actually is Unicode. (Check bit 11 in the flags before + * coming here?) + * http://sourceforge.net/p/infozip/bugs/40/ + */ + + if (MB_CUR_MAX <= 1) + { + /* There's no point to converting multi-byte chars if there are + * no multi-byte chars. + */ + wslen = (size_t)-1; } - while (*r) { - if (size > 0 && s >= slim && se == NULL) { - se = s; + else + { + /* Get Unicode wide character count (for storage allocation). */ + wslen = mbstowcs( NULL, raw, 0); + } + + if (wslen != (size_t)-1) + { + /* Apparently valid Unicode. Allocate wide-char storage. */ + wstring = (wchar_t *)malloc((wslen + 1) * sizeof(wchar_t)); + if (wstring == NULL) { + strcpy( (char *)space, raw); + return (char *)space; } -#ifdef QDOS - if (qlflag & 2) { - if (*r == '/' || *r == '.') { + wostring = (wchar_t *)malloc(2 * (wslen + 1) * sizeof(wchar_t)); + if (wostring == NULL) { + free(wstring); + strcpy( (char *)space, raw); + return (char *)space; + } + + /* Convert the multi-byte Unicode to wide chars. */ + wslen = mbstowcs(wstring, raw, wslen + 1); + + /* Filter the wide-character string. */ + fnfilterw( wstring, wostring, (2 * (wslen + 1) * sizeof(wchar_t))); + + /* Convert filtered wide chars back to multi-byte. */ + woslen = wcstombs( NULL, wostring, 0); + if ((newraw = malloc(woslen + 1)) == NULL) { + free(wstring); + free(wostring); + strcpy( (char *)space, raw); + return (char *)space; + } + woslen = wcstombs( newraw, wostring, (woslen * MB_CUR_MAX) + 1); + + if (size > 0) { + slim = space + size - 4; + } + r = (ZCONST uch *)newraw; + while (*r) { + if (size > 0 && s >= slim && se == NULL) { + se = s; + } +# ifdef QDOS + if (qlflag & 2) { + if (*r == '/' || *r == '.') { + if (se != NULL && (s > (space + (size-3)))) { + have_overflow = TRUE; + break; + } + ++r; + *s++ = '_'; + continue; + } + } else +# endif + { if (se != NULL && (s > (space + (size-3)))) { have_overflow = TRUE; break; } - ++r; - *s++ = '_'; - continue; + *s++ = *r++; } - } else + } + if (have_overflow) { + strcpy((char *)se, "..."); + } else { + *s = '\0'; + } + + free(wstring); + free(wostring); + free(newraw); + } + else +# endif /* defined( UNICODE_SUPPORT) && defined( _MBCS) */ + { + /* No Unicode support, or apparently invalid Unicode. */ + r = (ZCONST uch *)raw; + + if (size > 0) { + slim = space + size +#ifdef _MBCS + - (MB_CUR_MAX - 1) +#endif + - 4; + } + while (*r) { + if (size > 0 && s >= slim && se == NULL) { + se = s; + } +#ifdef QDOS + if (qlflag & 2) { + if (*r == '/' || *r == '.') { + if (se != NULL && (s > (space + (size-3)))) { + have_overflow = TRUE; + break; + } + ++r; + *s++ = '_'; + continue; + } + } else #endif #ifdef HAVE_WORKING_ISPRINT -# ifndef UZ_FNFILTER_REPLACECHAR - /* A convenient choice for the replacement of unprintable char codes is - * the "single char wildcard", as this character is quite unlikely to - * appear in filenames by itself. The following default definition - * sets the replacement char to a question mark as the most common - * "single char wildcard"; this setting should be overridden in the - * appropiate system-specific configuration header when needed. - */ -# define UZ_FNFILTER_REPLACECHAR '?' -# endif - if (!isprint(*r)) { + if (!isprint(*r)) { + if (*r < 32) { + /* ASCII control codes are escaped as "^{letter}". */ + if (se != NULL && (s > (space + (size-4)))) { + have_overflow = TRUE; + break; + } + *s++ = '^', *s++ = (uch)(64 + *r++); + } else { + /* Other unprintable codes are replaced by the + * placeholder character. */ + if (se != NULL && (s > (space + (size-3)))) { + have_overflow = TRUE; + break; + } + *s++ = UZ_FNFILTER_REPLACECHAR; + INCSTR(r); + } +#else /* !HAVE_WORKING_ISPRINT */ if (*r < 32) { /* ASCII control codes are escaped as "^{letter}". */ if (se != NULL && (s > (space + (size-4)))) { @@ -2710,47 +2843,30 @@ char *fnfilter(raw, space, size) /* convert name to safely printable form */ break; } *s++ = '^', *s++ = (uch)(64 + *r++); +#endif /* ?HAVE_WORKING_ISPRINT */ } else { - /* Other unprintable codes are replaced by the - * placeholder character. */ +#ifdef _MBCS + unsigned i = CLEN(r); + if (se != NULL && (s > (space + (size-i-2)))) { + have_overflow = TRUE; + break; + } + for (; i > 0; i--) + *s++ = *r++; +#else if (se != NULL && (s > (space + (size-3)))) { have_overflow = TRUE; break; } - *s++ = UZ_FNFILTER_REPLACECHAR; - INCSTR(r); - } -#else /* !HAVE_WORKING_ISPRINT */ - if (*r < 32) { - /* ASCII control codes are escaped as "^{letter}". */ - if (se != NULL && (s > (space + (size-4)))) { - have_overflow = TRUE; - break; - } - *s++ = '^', *s++ = (uch)(64 + *r++); -#endif /* ?HAVE_WORKING_ISPRINT */ - } else { -#ifdef _MBCS - unsigned i = CLEN(r); - if (se != NULL && (s > (space + (size-i-2)))) { - have_overflow = TRUE; - break; - } - for (; i > 0; i--) *s++ = *r++; -#else - if (se != NULL && (s > (space + (size-3)))) { - have_overflow = TRUE; - break; - } - *s++ = *r++; #endif - } - } - if (have_overflow) { - strcpy((char *)se, "..."); - } else { - *s = '\0'; + } + } + if (have_overflow) { + strcpy((char *)se, "..."); + } else { + *s = '\0'; + } } #ifdef WINDLL @@ -2772,6 +2888,53 @@ char *fnfilter(raw, space, size) /* convert name to safely printable form */ } /* end function fnfilter() */ +#if defined( UNICODE_SUPPORT) && defined( _MBCS) + +/****************************/ +/* Function fnfilter[w]() */ /* (Here instead of in list.c for SFX.) */ +/****************************/ + +/* fnfilterw() - Convert wide name to safely printable form. */ + +/* fnfilterw() - Convert wide-character name to safely printable form. */ + +wchar_t *fnfilterw( src, dst, siz) + ZCONST wchar_t *src; /* Pointer to source char (string). */ + wchar_t *dst; /* Pointer to destination char (string). */ + extent siz; /* Not used (!). */ +{ + wchar_t *dsx = dst; + + /* Filter the wide chars. */ + while (*src) + { + if (iswprint( *src)) + { + /* Printable code. Copy it. */ + *dst++ = *src; + } + else + { + /* Unprintable code. Substitute something printable for it. */ + if (*src < 32) + { + /* Replace ASCII control code with "^{letter}". */ + *dst++ = (wchar_t)'^'; + *dst++ = (wchar_t)(64 + *src); + } + else + { + /* Replace other unprintable code with the placeholder. */ + *dst++ = (wchar_t)UZ_FNFILTER_REPLACECHAR; + } + } + src++; + } + *dst = (wchar_t)0; /* NUL-terminate the destination string. */ + return dsx; +} /* fnfilterw(). */ + +#endif /* defined( UNICODE_SUPPORT) && defined( _MBCS) */ #ifdef SET_DIR_ATTRIB diff --git a/unzpriv.h b/unzpriv.h index 22d3923..e48a652 100644 --- a/unzpriv.h +++ b/unzpriv.h @@ -1212,6 +1212,7 @@ # ifdef UNICODE_WCHAR # if !(defined(_WIN32_WCE) || defined(POCKET_UNZIP)) # include +# include # endif # endif # ifndef _MBCS /* no need to include twice, see below */ @@ -2410,6 +2411,12 @@ int memflush OF((__GPRO__ ZCONST uch *rawbuf, ulg size)); char *fnfilter OF((ZCONST char *raw, uch *space, extent size)); +# if defined( UNICODE_SUPPORT) && defined( _MBCS) +wchar_t *fnfilterw OF((ZCONST wchar_t *src, wchar_t *dst, + extent siz)); +#endif + + /*--------------------------------------------------------------------------- Decompression functions: ---------------------------------------------------------------------------*/ -- 2.4.3