作者:王智通
这两天在class文件解析器的基础上, 加上了java反汇编的功能, 反汇编器是指令解释器的基础,通过编写反汇编器可以熟悉jvm的指令系统, 不过jvm的指令一共有201个,反汇编过程基本就是个体力活。在《java虚拟机规范》中对每一条指令都有了详细的描述,下面说说我是如何解析bytecode的:
一个java文件经过javac编译后会生成class格式文件, 在class格式中method字段里会有Code属性,Code属性包含了java的指令码和长度。 首先用class解析器将指令码提取出来, 举个例子:
test.java
// Sample input for the disassembler. Class aa declares one int field initialised to 6.
// test.main assigns 0 to i twice (once in the declaration, once in the for-initialiser)
// and prints "hehe" while i < 5.
class aa { int a = 6; }; public class test { public static void main(String args[]) { int i = 0; for (i = 0; i < 5; i++) System.out.println("hehe"); } }
我们用class文件解析器把test对应的bytecode打印出来:
len: 5
0x2a0xb70x00x10xb1
这一串bytecode为:0x2a 0xb7 0x00 0x01 0xb1(打印时字节之间没有分隔符), 长度是5个字节。
对照《java虚拟机规范》我们来一步步手工解析:
0x2a代表aload_0指令, 它将局部变量表中的第一个变量(索引为0)压入操作数栈。这个指令本身长度就是一个字节, 没有参数, 因此0x2a的解析非常简单, 直接在屏幕上打印出aload_0即可:
printf(“%s\n”, symbol);
0xb7代表invokespecial指令, 它用来调用超类构造方法、实例初始化方法和私有方法。它的用法如下:
invokespecial indexbyte1 indexbyte2。其中indexbyte1和indexbyte2各占一个字节, 用(indexbyte1 << 8) | indexbyte2来构建一个常量池中的索引。每个jvm指令的操作码本身占用一个字节, 再加上这两个参数, 一条invokespecial指令共占用3个字节。所以它的解析算法如下:
u2 index; index = ((*(u1 *)(base + 1)) << 8) | (*(u1 *)(base + 2)); printf("%s #%x\n", symbol, index);
注意0xb7解析完后,我们要跳过3个字节的地址,那么就是0xb1了, 它是return指令,没有参数,因此它的解析方法跟aload_0一样:
printf(“%s\n”, symbol);
以上是我们手工解析的过程, 但是jvm有201条指令, 我们需要建立一个合适的数据结构:
typedef int (*interp_func)(u2 opcode_len, char *symbol, void *base); typedef struct bytecode_st { u2 opcode; // jvm的指令码 u2 opcode_len; // 指令总的长度,包括参数 char symbol[OPCODE_SYMBOL_LEN]; // 指令对应的助记符 interp_func func; // 解析指令的回调函数 }BYTECODE;
我们可以直接建立一个大的BYTECODE数组:
BYTECODE jvm_byte_code[OPCODE_LEN] = { {0x00, 1, "nop", jvm_interp_nop}, {0x01, 1, "aconst_null", jvm_interp_aconst_null}, {0x02, 1, "iconst_m1", jvm_interp_iconst_m1}, {0x03, 1, "iconst_0", jvm_interp_iconst_0}, {0x04, 1, "iconst_1", jvm_interp_iconst_1}, {0x05, 1, "iconst_2", jvm_interp_iconst_2}, {0x06, 1, "iconst_3", jvm_interp_iconst_3}, {0x07, 1, "iconst_4", jvm_interp_iconst_4}, {0x08, 1, "iconst_5", jvm_interp_iconst_5}, {0x09, 1, "lconst_0", jvm_interp_lconst_0}, {0x0a, 1, "lconst_1", jvm_interp_lconst_1}, {0x0b, 1, "fconst_0", jvm_interp_fconst_0}, {0x0c, 1, "fconst_1", jvm_interp_fconst_1}, {0x0d, 1, "fconst_2", jvm_interp_fconst_2}, {0x0e, 1, "dconst_0", jvm_interp_dconst_0}, {0x0f, 1, "dconst_1", jvm_interp_dconst_1}, {0x10, 1, "bipush", jvm_interp_bipush}, {0x11, 1, "sipush", jvm_interp_sipush}, {0x12, 2, "ldc", jvm_interp_ldc}, {0x13, 1, "ldc_w", jvm_interp_ldc_w}, {0x14, 1, "ldc2_w", jvm_interp_ldc2_w}, {0x15, 1, "iload", jvm_interp_iload}, {0x16, 1, "lload", jvm_interp_lload}, {0x17, 1, "fload", jvm_interp_fload}, {0x18, 1, "dload", jvm_interp_dload}, {0x19, 1, "aload", jvm_interp_aload}, {0x1a, 1, "iload_0", jvm_interp_iload_0}, {0x1b, 1, "iload_1", jvm_interp_iload_1}, {0x1c, 1, "iload_2", jvm_interp_iload_2}, {0x1d, 1, "iload_3", jvm_interp_iload_3}, {0x1e, 1, "lload_0", jvm_interp_lload_0}, {0x1f, 1, "lload_1", jvm_interp_lload_1}, {0x20, 1, "lload_2", jvm_interp_lload_2}, {0x21, 1, "lload_3", jvm_interp_lload_3}, {0x22, 1, "fload_0", jvm_interp_fload_0}, {0x23, 1, "fload_1", jvm_interp_fload_1}, {0x24, 1, "fload_2", jvm_interp_fload_2}, {0x25, 1, "fload_3", jvm_interp_fload_3}, {0x26, 1, "dload_0", jvm_interp_dload_0}, {0x27, 1, "dload_1", jvm_interp_dload_1}, {0x28, 1, "dload_2", jvm_interp_dload_2}, {0x29, 1, "dload_3", jvm_interp_dload_3}, {0x2a, 1, "aload_0", jvm_interp_aload_0}, {0x2b, 1, "aload_1", jvm_interp_aload_1}, {0x2c, 1, "aload_2", jvm_interp_aload_2}, {0x2d, 1, "aload_3", jvm_interp_aload_3}, {0x2e, 1, "iaload", 
jvm_interp_iaload}, {0x2f, 1, "laload", jvm_interp_laload}, {0x30, 1, "faload", jvm_interp_faload}, {0x31, 1, "daload", jvm_interp_daload}, {0x32, 1, "aaload", jvm_interp_aaload}, {0x33, 1, "baload", jvm_interp_baload}, {0x34, 1, "caload", jvm_interp_caload}, {0x35, 1, "saload", jvm_interp_saload}, {0x36, 1, "istore", jvm_interp_istore}, {0x37, 1, "lstore", jvm_interp_lstore}, {0x38, 1, "fstore", jvm_interp_fstore}, {0x39, 1, "dstore", jvm_interp_dstore}, {0x3a, 1, "astore", jvm_interp_astore}, {0x3b, 1, "istore_0", jvm_interp_istore_0}, {0x3c, 1, "istore_1", jvm_interp_istore_1}, {0x3d, 1, "istore_2", jvm_interp_istore_2}, {0x3e, 1, "istore_3", jvm_interp_istore_3}, {0x3f, 1, "lstore_0", jvm_interp_lstore_0}, {0x40, 1, "lstore_1", jvm_interp_lstore_1}, {0x41, 1, "lstore_2", jvm_interp_lstore_2}, {0x42, 1, "lstore_3", jvm_interp_lstore_3}, {0x43, 1, "fstore_0", jvm_interp_fstore_0}, {0x44, 1, "fstore_1", jvm_interp_fstore_1}, {0x45, 1, "fstore_2", jvm_interp_fstore_2}, {0x46, 1, "fstore_3", jvm_interp_fstore_3}, {0x47, 1, "dstore_0", jvm_interp_dstore_0}, {0x48, 1, "dstore_1", jvm_interp_dstore_1}, {0x49, 1, "dstore_2", jvm_interp_dstore_2}, {0x4a, 1, "dstore_3", jvm_interp_dstore_3}, {0x4b, 1, "astore_0", jvm_interp_astore_0}, {0x4c, 1, "astore_1", jvm_interp_astore_1}, {0x4d, 1, "astore_2", jvm_interp_astore_2}, {0x4e, 1, "astore_3", jvm_interp_astore_3}, {0x4f, 1, "iastore", jvm_interp_iastore}, {0x50, 1, "lastore", jvm_interp_lastore}, {0x51, 1, "fastore", jvm_interp_fastore}, {0x52, 1, "dastore", jvm_interp_dastore}, {0x53, 1, "aastore", jvm_interp_aastore}, {0x54, 1, "bastore", jvm_interp_bastore}, {0x55, 1, "castore", jvm_interp_castore}, {0x56, 1, "sastore", jvm_interp_sastore}, {0x57, 1, "pop", jvm_interp_pop}, {0x58, 1, "pop2", jvm_interp_pop2}, {0x59, 1, "dup", jvm_interp_dup}, {0x5a, 1, "dup_x1", jvm_interp_dup_x1}, {0x5b, 1, "dup_x2", jvm_interp_dup_x2}, {0x5c, 1, "dup2", jvm_interp_dup2}, {0x5d, 1, "dup2_x1", jvm_interp_dup2_x1}, {0x5e, 1, "dup2_x2", 
jvm_interp_dup2_x2}, {0x5f, 1, "swap", jvm_interp_swap}, {0x60, 1, "iadd", jvm_interp_iadd}, {0x61, 1, "ladd", jvm_interp_ladd}, {0x62, 1, "fadd", jvm_interp_fadd}, {0x63, 1, "dadd", jvm_interp_dadd}, {0x64, 1, "isub", jvm_interp_isub}, {0x65, 1, "lsub", jvm_interp_lsub}, {0x66, 1, "fsub", jvm_interp_fsub}, {0x67, 1, "dsub", jvm_interp_dsub}, {0x68, 1, "imul", jvm_interp_imul}, {0x69, 1, "lmul", jvm_interp_lmul}, {0x6a, 1, "fmul", jvm_interp_fmul}, {0x6b, 1, "dmul", jvm_interp_dmul}, {0x6c, 1, "idiv", jvm_interp_idiv}, {0x6d, 1, "ldiv", jvm_interp_ldiv}, {0x6e, 1, "fdiv", jvm_interp_fdiv}, {0x6f, 1, "ddiv", jvm_interp_ddiv}, {0x70, 1, "irem", jvm_interp_irem}, {0x71, 1, "lrem", jvm_interp_lrem}, {0x72, 1, "frem", jvm_interp_frem}, {0x73, 1, "drem", jvm_interp_drem}, {0x74, 1, "ineg", jvm_interp_ineg}, {0x75, 1, "lneg", jvm_interp_lneg}, {0x76, 1, "fneg", jvm_interp_fneg}, {0x77, 1, "dneg", jvm_interp_dneg}, {0x78, 1, "ishl", jvm_interp_ishl}, {0x79, 1, "lshl", jvm_interp_lshl}, {0x7a, 1, "ishr", jvm_interp_ishr}, {0x7b, 1, "lshr", jvm_interp_lshr}, {0x7c, 1, "iushr", jvm_interp_iushr}, {0x7d, 1, "lushr", jvm_interp_lushr}, {0x7e, 1, "iand", jvm_interp_iand}, {0x7f, 1, "land", jvm_interp_land}, {0x80, 1, "ior", jvm_interp_ior}, {0x81, 1, "lor", jvm_interp_lor}, {0x82, 1, "ixor", jvm_interp_ixor}, {0x83, 1, "lxor", jvm_interp_lxor}, {0x84, 3, "iinc", jvm_interp_iinc}, {0x85, 1, "i2l", jvm_interp_i2l}, {0x86, 1, "i2f", jvm_interp_i2f}, {0x87, 1, "i2d", jvm_interp_i2d}, {0x88, 1, "l2i", jvm_interp_l2i}, {0x89, 1, "l2f", jvm_interp_l2f}, {0x8a, 1, "l2d", jvm_interp_l2d}, {0x8b, 1, "f2i", jvm_interp_f2i}, {0x8c, 1, "f2l", jvm_interp_f2l}, {0x8d, 1, "f2d", jvm_interp_f2d}, {0x8e, 1, "d2i", jvm_interp_d2i}, {0x8f, 1, "d2l", jvm_interp_d2l}, {0x90, 1, "d2f", jvm_interp_d2f}, {0x91, 1, "i2b", jvm_interp_i2b}, {0x92, 1, "i2c", jvm_interp_i2c}, {0x93, 1, "i2s", jvm_interp_i2s}, {0x94, 1, "lcmp", jvm_interp_lcmp}, {0x95, 1, "fcmpl", jvm_interp_fcmpl}, {0x96, 1, "fcmpg", 
jvm_interp_fcmpg}, {0x97, 1, "dcmpl", jvm_interp_dcmpl}, {0x98, 1, "dcmpg", jvm_interp_dcmpg}, {0x99, 1, "ifeq", jvm_interp_ifeq}, {0x9a, 1, "ifne", jvm_interp_ifne}, {0x9b, 1, "iflt", jvm_interp_iflt}, {0x9c, 1, "ifge", jvm_interp_ifge}, {0x9d, 1, "ifgt", jvm_interp_ifgt}, {0x9e, 1, "ifle", jvm_interp_ifle}, {0x9f, 1, "if_icmpeq", jvm_interp_if_icmpeq}, {0xa0, 1, "if_icmpne", jvm_interp_if_icmpne}, {0xa1, 1, "if_icmplt", jvm_interp_if_icmplt}, {0xa2, 3, "if_icmpge", jvm_interp_if_icmpge}, {0xa3, 1, "if_icmpgt", jvm_interp_if_icmpgt}, {0xa4, 1, "if_icmple", jvm_interp_if_icmple}, {0xa5, 1, "if_acmpeq", jvm_interp_if_acmpeq}, {0xa6, 1, "if_acmpne", jvm_interp_if_acmpne}, {0xa7, 3, "goto", jvm_interp_goto}, {0xa8, 1, "jsr", jvm_interp_jsr}, {0xa9, 1, "ret", jvm_interp_ret}, {0xaa, 1, "tableswitch", jvm_interp_tableswitch}, {0xab, 1, "lookupswitch", jvm_interp_lookupswitch}, {0xac, 1, "ireturn", jvm_interp_ireturn}, {0xad, 1, "lreturn", jvm_interp_lreturn}, {0xae, 1, "freturn", jvm_interp_freturn}, {0xaf, 1, "dreturn", jvm_interp_dreturn}, {0xb0, 1, "areturn", jvm_interp_areturn}, {0xb1, 1, "return", jvm_interp_return}, {0xb2, 3, "getstatic", jvm_interp_getstatic}, {0xb3, 1, "putstatic", jvm_interp_putstatic}, {0xb4, 1, "getfield", jvm_interp_getfield}, {0xb5, 1, "putfield", jvm_interp_putfield}, {0xb6, 3, "invokevirtual", jvm_interp_invokevirtual}, {0xb7, 3, "invokespecial", jvm_interp_invokespecial}, {0xb8, 1, "invokestatic", jvm_interp_invokestatic}, {0xb9, 1, "invokeinterface", jvm_interp_invokeinterface}, {0xba, 1, "invokedynamic", jvm_interp_invokedynamic}, {0xbb, 1, "new", jvm_interp_new}, {0xbc, 1, "newarray", jvm_interp_newarray}, {0xbd, 1, "anewarray", jvm_interp_anewarray}, {0xbe, 1, "arraylength", jvm_interp_arraylength}, {0xbf, 1, "athrow", jvm_interp_athrow}, {0xc0, 1, "checkcast", jvm_interp_checkcast}, {0xc1, 1, "instanceof", jvm_interp_instanceof}, {0xc2, 1, "monitorenter", jvm_interp_monitorenter}, {0xc3, 1, "monitorexit", jvm_interp_monitorexit}, 
{0xc4, 1, "wide", jvm_interp_wide}, {0xc5, 1, "multianewarray", jvm_interp_multianewarray}, {0xc6, 1, "ifnull", jvm_interp_ifnull}, {0xc7, 1, "ifnonnull", jvm_interp_ifnonnull}, {0xc8, 1, "goto_w", jvm_interp_goto_w}, {0xc9, 1, "jsr_w", jvm_interp_jsr_w}, };
每个jvm指令的指令码就是数组的索引, 这样就能找到指令对应的BYTECODE结构,通过调用其回调函数, 就可以进入具体的解析过程了。 这样做的好处就是不用switch case一大堆分支了。
/*
 * Print "invokespecial #<index>", where <index> is the constant-pool index
 * built from the two operand bytes: (indexbyte1 << 8) | indexbyte2.
 *
 *   len    - instruction length from the opcode table (unused here)
 *   symbol - mnemonic to print
 *   base   - points at the opcode byte; the operands follow it
 *
 * Returns 0.
 */
int jvm_interp_invokespecial(u2 len, char *symbol, void *base)
{
    /* Cast once: arithmetic directly on a void pointer is not standard C. */
    const u1 *insn = (const u1 *)base;
    u2 index = (u2)((insn[1] << 8) | insn[2]);

    (void)len;
    printf("%s #%x\n", symbol, (unsigned int)index);
    return 0;
}

/* Print the mnemonic of aload_0, which has no operands. Returns 0. */
int jvm_interp_aload_0(u2 len, char *symbol, void *base)
{
    (void)len;
    (void)base;
    printf("%s\n", symbol);
    return 0;
}

/* Print the mnemonic of return, which has no operands. Returns 0. */
int jvm_interp_return(u2 len, char *symbol, void *base)
{
    (void)len;
    (void)base;
    printf("%s\n", symbol);
    return 0;
}

/*
 * Disassemble len bytes of bytecode starting at base.
 *
 * Each opcode byte is used as an index into jvm_byte_code; the entry's
 * callback prints the instruction and the entry's opcode_len tells how far
 * to advance.
 *
 * Returns 0 when the whole buffer was decoded, -1 when decoding had to stop
 * (opcode outside the table, entry without callback or length, or an
 * instruction that would run past the end of the buffer).
 */
int __disass_bytecode(u1 *base, u2 len)
{
    const size_t table_size = sizeof(jvm_byte_code) / sizeof(jvm_byte_code[0]);
    /* Wider than u1/u2 so that offset + instruction length cannot wrap. */
    unsigned int offset = 0;

    while (offset < len) {
        u1 opcode = base[offset];
        u2 insn_len;

        if (opcode >= table_size) {
            fprintf(stderr, "unknown opcode 0x%x at offset %u\n",
                    (unsigned int)opcode, offset);
            return -1;
        }

        insn_len = jvm_byte_code[opcode].opcode_len;
        /* A zero length would never advance; a NULL callback cannot be called. */
        if (jvm_byte_code[opcode].func == NULL || insn_len == 0) {
            fprintf(stderr, "unsupported opcode 0x%x at offset %u\n",
                    (unsigned int)opcode, offset);
            return -1;
        }

        /* The callback reads its operands, so they must lie inside the buffer. */
        if (insn_len > len - offset) {
            fprintf(stderr, "truncated instruction 0x%x at offset %u\n",
                    (unsigned int)opcode, offset);
            return -1;
        }

        jvm_byte_code[opcode].func(insn_len, jvm_byte_code[opcode].symbol,
                                   base + offset);
        offset += insn_len;
    }

    return 0;
}
目前这个反汇编器只能解析一小部分指令, 随着开发的深入, 会慢慢补全的, 下面是反汇编test.class的结果:
diassember bytecode: aload_0 invokespecial #1 return ----------------------------- iconst_0 istore_1 iconst_0 istore_1 iload_1 iconst_5 if_icmpge 17 getstatic #2 ldc #3 invokevirtual #4 iinc 1 1 goto 0xfff0 return
java工具集中提供了javap, 可以反汇编java指令,本来是想山寨一个javap的, 但是现在对jvm整体结构还是不清晰,数据结构还不能很好的设计出来, 但是随着对jvm的了解深入, 反汇编器会越来越成熟。