file-5.05-jtext.patch 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. --- file-5.05/src/Makefile.am.vinejtext 2010-07-22 00:56:10.000000000 +0900
  2. +++ file-5.05/src/Makefile.am 2011-02-11 16:53:06.000000000 +0900
  3. @@ -4,11 +4,11 @@
  4. bin_PROGRAMS = file
  5. -AM_CPPFLAGS = -DMAGIC='"$(MAGIC)"'
  6. +AM_CPPFLAGS = -DMAGIC='"$(MAGIC)"' -DDETECT_JAPANESE
  7. AM_CFLAGS = @WARNINGS@
  8. libmagic_la_SOURCES = magic.c apprentice.c softmagic.c ascmagic.c \
  9. - encoding.c compress.c is_tar.c readelf.c print.c fsmagic.c \
  10. + encoding.c compress.c is_tar.c readelf.c print.c jcode.c fsmagic.c \
  11. funcs.c file.h names.h patchlevel.h readelf.h tar.h apptype.c \
  12. file_opts.h elfclass.h mygetopt.h cdf.c cdf_time.c readcdf.c cdf.h
  13. libmagic_la_LDFLAGS = -no-undefined -version-info 1:0:0
  14. --- file-5.05/src/encoding.c.vinejtext 2010-07-22 01:47:17.000000000 +0900
  15. +++ file-5.05/src/encoding.c 2011-02-11 17:26:00.000000000 +0900
  16. @@ -42,7 +42,7 @@ FILE_RCSID("@(#)$File: encoding.c,v 1.5
  17. #include <string.h>
  18. #include <memory.h>
  19. #include <stdlib.h>
  20. -
  21. +#include "jcode.h"
  22. private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
  23. private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
  24. @@ -68,7 +68,7 @@ protected int
  25. file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type)
  26. {
  27. size_t mlen;
  28. - int rv = 1, ucs_type;
  29. + int rv = 1, ucs_type, jcode;
  30. unsigned char *nbuf = NULL;
  31. mlen = (nbytes + 1) * sizeof(nbuf[0]);
  32. @@ -83,10 +83,27 @@ file_encoding(struct magic_set *ms, cons
  33. }
  34. *type = "text";
  35. - if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
  36. + jcode = detect_kcode(buf, nbytes, *ubuf, ulen);
  37. + if (jcode == ASCII) {
  38. DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
  39. *code = "ASCII";
  40. *code_mime = "us-ascii";
  41. + } else if (jcode == JIS) {
  42. + DPRINTF(("jis %" SIZE_T_FORMAT "u\n", *ulen));
  43. + *code = "7-bit JIS [ESC$B, ESC(B]";
  44. + *code_mime = "jis";
  45. + } else if (jcode == SJIS){
  46. + DPRINTF(("sjis %" SIZE_T_FORMAT "u\n", *ulen));
  47. + *code = "SJIS";
  48. + *code_mime = "sjis";
  49. + } else if (jcode == EUC){
  50. + DPRINTF(("euc %" SIZE_T_FORMAT "u\n", *ulen));
  51. + *code = "EUC";
  52. + *code_mime = "euc-jp";
  53. + } else if (jcode == EUCORSJIS){
  54. + DPRINTF(("euc or sjis %" SIZE_T_FORMAT "u\n", *ulen));
  55. + *code = "EUC or SJIS";
  56. + *code_mime = "unknown";
  57. } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
  58. DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
  59. *code = "UTF-8 Unicode (with BOM)";
  60. --- /dev/null 2011-02-06 21:11:58.373999997 +0900
  61. +++ file-5.05/src/jcode.c 2011-02-11 17:14:29.000000000 +0900
  62. @@ -0,0 +1,205 @@
  63. +/*
  64. +jcode.c: Kanji-code detection routine by Jun Nishii <jun@vinelinux.org>
  65. + modified by Ryoichi INAGAKI <inagaki@vinelinux.org>
  66. + */
  67. +#include <stdio.h>
  68. +#include <unistd.h>
  69. +#include <signal.h>
  70. +#include <sys/types.h>
  71. +#include <sys/wait.h>
  72. +#include <jcode.h>
  73. +
  74. +typedef unsigned long unichar;
  75. +
  76. +#define F 0 /* character never appears in text */
  77. +#define T 1 /* character appears in plain ASCII text */
  78. +#define I 2 /* character appears in ISO-8859 text */
  79. +#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
  80. +#define J 4 /* character appears in JIS or plain ASCII */
  81. +#define S 5 /* character appears in SJIS */
  82. +#define E 6 /* character appears in EUC */
  83. +#define O 7 /* character appears in EUC or SJIS */
  84. +
  85. +#define ESC 27
  86. +
  87. +static char jp_chars1[256] = {
  88. + F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */
  89. + F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
  90. + T, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, /* 0x2X */
  91. + J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, /* 0x3X */
  92. + J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, /* 0x4X */
  93. + J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, /* 0x5X */
  94. + J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, /* 0x6X */
  95. + J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, F, /* 0x7X */
  96. + /* NEL */
  97. + X, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x8X */
  98. + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x9X */
  99. + I, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, /* 0xaX */
  100. + E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, /* 0xbX */
  101. + E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, /* 0xcX */
  102. + E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, /* 0xdX */
  103. + O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 0xeX */
  104. + E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, I /* 0xfX */
  105. +};
  106. +
  107. +static char jp_chars2[256] = {
  108. + /* BEL BS HT LF FF CR */
  109. + F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */
  110. + /* ESC */
  111. + F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
  112. + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */
  113. + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */
  114. + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x4X */
  115. + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x5X */
  116. + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x6X */
  117. + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, F, /* 0x7X */
  118. + /* NEL */
  119. + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x8X */
  120. + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 0x9X */
  121. + S, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 0xaX */
  122. + O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 0xbX */
  123. + O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 0xcX */
  124. + O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 0xdX */
  125. + O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 0xeX */
  126. + O, O, O, O, O, O, O, O, O, O, O, O, O, E, E, I /* 0xfX */
  127. +};
  128. +
  129. +
  130. +int
  131. +check_asc_jis(buf, nbytes, ubuf, ulen)
  132. + const unsigned char *buf;
  133. + size_t nbytes;
  134. + unichar *ubuf;
  135. + size_t *ulen;
  136. +{
  137. + size_t i;
  138. + int jflag;
  139. +
  140. + *ulen = 0; jflag=0;
  141. +
  142. + for (i = 0; i < nbytes; i++) {
  143. + int t = jp_chars1[buf[i]];
  144. +
  145. + if (t != T && t != J )
  146. + return 0;
  147. +
  148. + if (buf[i] == ESC && i+2<nbytes) {
  149. + if ((buf[i+1]=='$' && buf[i+2]=='B')||
  150. + (buf[i+1]=='$' && buf[i+2]=='@')) jflag=1;
  151. + }
  152. +
  153. + ubuf[(*ulen)++] = buf[i];
  154. + }
  155. +
  156. + if (jflag==1) return JIS;
  157. + else return ASCII;
  158. +}
  159. +
  160. +int
  161. +check_sjis(buf, nbytes, ubuf, ulen)
  162. + const unsigned char *buf;
  163. + size_t nbytes;
  164. + unichar *ubuf;
  165. + size_t *ulen;
  166. +{
  167. + size_t i;
  168. + int jflag;
  169. +
  170. + *ulen = 0;
  171. + jflag = ASCII;
  172. + for (i = 0; i < nbytes; i++) {
  173. + int t = jp_chars1[buf[i]];
  174. +
  175. + if (t != T && t != J && t != S && t!= O)
  176. + return 0;
  177. +
  178. + if (t == S && i<nbytes-1){
  179. + ubuf[(*ulen)++] = buf[i];
  180. + ++i;
  181. + t=jp_chars2[buf[i]];
  182. + if(t != S && t != O ) return 0;
  183. + jflag=SJIS;
  184. + } else if (t == O && i<nbytes-1){
  185. + ubuf[(*ulen)++] = buf[i];
  186. + ++i;
  187. + t=jp_chars2[buf[i]];
  188. + if( t == S ){ jflag=SJIS; }
  189. + else if( t == O ){ if(jflag==ASCII) jflag=EUCORSJIS; }
  190. + else return 0;
  191. + }
  192. +
  193. + ubuf[(*ulen)++] = buf[i];
  194. + }
  195. +#ifdef Z
  196. + if (jflag==SJIS) {ckfputs("SJIS text", stdout); return SJIS;}
  197. + if (jflag==EUCORSJIS) {ckfputs("EUCorSJIS", stdout); return EUCORSJIS;}
  198. +#else
  199. + if (jflag==SJIS) {return SJIS;}
  200. + if (jflag==EUCORSJIS) {return EUCORSJIS;}
  201. +#endif
  202. +}
  203. +
  204. +int
  205. +check_euc(buf, nbytes, ubuf, ulen)
  206. + const unsigned char *buf;
  207. + size_t nbytes;
  208. + unichar *ubuf;
  209. + size_t *ulen;
  210. +{
  211. + size_t i;
  212. + int jflag;
  213. +
  214. + *ulen = 0;
  215. + jflag = ASCII;
  216. +
  217. + for (i = 0; i < nbytes; i++) {
  218. + int t = jp_chars1[buf[i]];
  219. +
  220. + if (t != T && t != J && t != E && t!= O)
  221. + return 0;
  222. +
  223. + if (t == E && i<nbytes-1){
  224. + ubuf[(*ulen)++] = buf[i];
  225. + ++i;
  226. + t= jp_chars2[buf[i]];
  227. + if( t != E && t != O) return 0;
  228. + jflag=EUC;
  229. + } else if (t == O && i<nbytes-1){
  230. + ubuf[(*ulen)++] = buf[i];
  231. + ++i;
  232. + t=jp_chars2[buf[i]];
  233. + if( t == E ){ jflag=EUC; }
  234. + else if( t == O ){ if(jflag==ASCII) jflag=EUCORSJIS; }
  235. + else return 0;
  236. + }
  237. +
  238. + ubuf[(*ulen)++] = buf[i];
  239. + }
  240. +#ifdef Z
  241. + if (jflag==EUC) {ckfputs("EUC text", stdout); return EUC;}
  242. + if (jflag==EUCORSJIS) {ckfputs("EUCorSJIS", stdout); return EUCORSJIS;}
  243. +#else
  244. + if (jflag==EUC) { return EUC;}
  245. + if (jflag==EUCORSJIS) {return EUCORSJIS;}
  246. +#endif
  247. +}
  248. +
  249. +int
  250. +detect_kcode(buf, nbytes, ubuf, ulen)
  251. + const unsigned char *buf;
  252. + size_t nbytes;
  253. + unichar *ubuf;
  254. + size_t *ulen;
  255. +{
  256. + int ret;
  257. + ret=check_asc_jis(buf, nbytes, ubuf, ulen);
  258. + if(ret==ASCII) return ASCII;
  259. + if(ret==JIS) return JIS;
  260. +
  261. + ret=check_sjis(buf, nbytes, ubuf, ulen);
  262. + if(ret==SJIS) return SJIS;
  263. + if(ret==EUCORSJIS) return EUCORSJIS;
  264. + ret=check_euc(buf, nbytes, ubuf, ulen);
  265. + if(ret==EUC) return EUC;
  266. + if(ret==EUCORSJIS) return EUCORSJIS;
  267. +}
  268. --- /dev/null 2011-02-06 21:11:58.373999997 +0900
  269. +++ file-5.05/src/jcode.h 2011-02-11 17:12:11.000000000 +0900
  270. @@ -0,0 +1,15 @@
  271. +/*
  272. + jcode.h - for jcode.c by Jun Nishii <jun@vinelinux.org>
  273. + modified by Ryoichi INAGAKI <inagaki@vinelinux.org>
  274. + */
  275. +
  276. +#define ASCII 1
  277. +#define JIS 2
  278. +#define EUC 3
  279. +#define SJIS 4
  280. +#define EUCORSJIS 5
  281. +
  282. +extern int detect_kcode (const unsigned char *, size_t, unichar *, size_t *);
  283. +extern int looks_jis (const unsigned char *, size_t, unichar *, size_t *);
  284. +extern int looks_sjis (const unsigned char *, size_t, unichar *, size_t *);
  285. +extern int looks_euc (const unsigned char *, size_t, unichar *, size_t *);
  286. --- file-5.05/src/names.h.vinejtext 2010-10-09 06:58:44.000000000 +0900
  287. +++ file-5.05/src/names.h 2011-02-11 17:28:18.000000000 +0900
  288. @@ -135,8 +135,6 @@
  289. {"/*", L_C, 2 }, /* must precede "The", "the", etc. */
  290. {"#include", L_C, 2 },
  291. {"char", L_C, 2 },
  292. - {"The", L_ENG, 2 },
  293. - {"the", L_ENG, 2 },
  294. {"double", L_C, 1 },
  295. {"extern", L_C, 2 },
  296. {"float", L_C, 1 },