// crc32_amd64.s
//+build !noasm
//+build !appengine

// Copyright 2015, Klaus Post, see LICENSE for details.
// func crc32sse(a []byte) uint32
//
// Returns the SSE4.2 hardware CRC32 (Castagnoli polynomial, the one the
// CRC32 instruction implements) of the first four bytes of a, seeded
// with 0. NOTE(review): nothing here checks the length — caller must
// guarantee len(a) >= 4 and that the CPU supports SSE4.2; confirm at
// call sites.
TEXT ·crc32sse(SB), 4, $0 // flag 4 = NOSPLIT
	MOVQ a+0(FP), R10 // R10 = &a[0]
	XORQ BX, BX       // BX = 0: CRC seed

	// CRC32 dword (R10), EBX — SSE4.2 op emitted as raw bytes because
	// the assembler lacked a CRC32 mnemonic when this was written.
	BYTE $0xF2; BYTE $0x41; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0x1a

	MOVL BX, ret+24(FP) // result; +24 skips the 24-byte slice header
	RET
// func crc32sseAll(a []byte, dst []uint32)
//
// For every window start i in 0..len(a)-4, stores in dst[i] the SSE4.2
// CRC32 (seed 0) of the 4-byte window a[i:i+4] — len(a)-3 values total.
// If len(a) < 4 nothing is stored. NOTE(review): dst length is never
// read here; caller must guarantee len(dst) >= len(a)-3 and SSE4.2
// support — confirm at call sites.
TEXT ·crc32sseAll(SB), 4, $0 // flag 4 = NOSPLIT
	MOVQ a+0(FP), R8      // R8: src pointer
	MOVQ a_len+8(FP), R10 // R10: input length
	MOVQ dst+24(FP), R9   // R9: dst pointer
	SUBQ $4, R10          // R10 = len-4 = index of the last window
	JS end                // len < 4: no window fits, store nothing
	JZ one_crc            // len == 4: exactly one window
	MOVQ R10, R13
	SHRQ $2, R10 // R10 = (len-4)/4: unrolled-loop iterations
	ANDQ $3, R13 // R13 = (len-4)&3
	XORQ BX, BX
	ADDQ $1, R13 // R13+1 windows are left for the scalar tail loop
	TESTQ R10, R10
	JZ rem_loop // no full unrolled pass possible: go straight to tail

	// Unrolled loop: each pass loads 8 src bytes and emits the CRCs of
	// the four overlapping dword windows at byte offsets 0, 1, 2, 3.
crc_loop:
	MOVQ (R8), R11 // R11 = 8 bytes of src
	XORQ BX, BX    // zero the three parallel CRC accumulators
	XORQ DX, DX
	XORQ DI, DI
	MOVQ R11, R12
	SHRQ $8, R11  // R11 = src bytes 1..7
	MOVQ R12, AX  // EAX = window at offset 0
	MOVQ R11, CX  // ECX = window at offset 1
	SHRQ $16, R12 // R12 = src bytes 2..7
	SHRQ $16, R11 // R11 = src bytes 3..7
	MOVQ R12, SI  // ESI = window at offset 2

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	// CRC32 ECX, EDX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd1

	// CRC32 ESI, EDI
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xfe

	MOVL BX, (R9)  // dst[i+0]
	MOVL DX, 4(R9) // dst[i+1]
	MOVL DI, 8(R9) // dst[i+2]
	XORQ BX, BX    // fresh seed for the fourth window
	MOVL R11, AX   // EAX = window at offset 3

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	MOVL BX, 12(R9) // dst[i+3]
	ADDQ $16, R9    // dst advances four uint32s
	ADDQ $4, R8     // src advances four bytes
	XORQ BX, BX
	SUBQ $1, R10
	JNZ crc_loop

	// Scalar tail: one window per iteration, R13 windows remaining.
rem_loop:
	MOVL (R8), AX // EAX = current 4-byte window

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	MOVL BX, (R9)
	ADDQ $4, R9
	ADDQ $1, R8
	XORQ BX, BX // reseed for the next window
	SUBQ $1, R13
	JNZ rem_loop

end:
	RET

	// len == 4: route the single window through the tail loop.
one_crc:
	MOVQ $1, R13
	XORQ BX, BX
	JMP rem_loop
// func matchLenSSE4(a, b []byte, max int) int
//
// Returns the length of the common prefix of a and b, capped at max.
// Despite the name, only plain 64-bit loads/compares and BSF are used —
// no SSE4 instructions appear in this body. NOTE(review): the slice
// lengths a_len/b_len are never read; caller must guarantee
// len(a) >= max and len(b) >= max — confirm at call sites.
TEXT ·matchLenSSE4(SB), 4, $0 // flag 4 = NOSPLIT
	MOVQ a_base+0(FP), SI  // SI = &a[0], advanced as bytes match
	MOVQ b_base+24(FP), DI // DI = &b[0], advanced as bytes match
	MOVQ DI, DX            // DX = &b[0], kept to compute the return value
	MOVQ max+48(FP), CX    // CX = bytes still allowed to match

cmp8:
	// As long as we are 8 or more bytes before the end of max, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ CX, $8
	JLT cmp1
	MOVQ (SI), AX
	MOVQ (DI), BX
	CMPQ AX, BX
	JNE bsf
	ADDQ $8, SI
	ADDQ $8, DI
	SUBQ $8, CX
	JMP cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, DI

	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET

cmp1:
	// In the slices' tail, compare 1 byte at a time.
	CMPQ CX, $0
	JEQ matchLenEnd
	MOVB (SI), AX
	MOVB (DI), BX
	CMPB AX, BX // byte-sized compare, so stale upper bits of AX/BX are harmless
	JNE matchLenEnd
	ADDQ $1, SI
	ADDQ $1, DI
	SUBQ $1, CX
	JMP cmp1

matchLenEnd:
	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET
// func histogram(b []byte, h []int32)
//
// Adds the byte-value histogram of b into h: h[v]++ (32-bit increment)
// for every byte v of b. Counters are not cleared first, so counts
// accumulate onto whatever h already holds. NOTE(review): h's length is
// never read; caller must guarantee len(h) >= 256 — confirm at call
// sites.
TEXT ·histogram(SB), 4, $0 // flag 4 = NOSPLIT
	MOVQ b+0(FP), SI     // SI: &b[0]
	MOVQ b_len+8(FP), R9 // R9: len(b)
	MOVQ h+24(FP), DI    // DI: histogram base
	MOVQ R9, R8
	SHRQ $3, R8 // R8 = len(b)/8: unrolled-loop iterations
	JZ hist1    // fewer than 8 bytes: scalar loop only
	XORQ R11, R11

	// Unrolled loop: count 8 bytes per pass from a single 64-bit load.
	// MOVB writes only R11's low byte; its upper bits were zeroed above
	// and never become nonzero, so R11 is a safe 0..255 index into h.
loop_hist8:
	MOVQ (SI), R10 // R10 = 8 source bytes
	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 0]++
	SHRQ $8, R10
	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 1]++
	SHRQ $8, R10
	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 2]++
	SHRQ $8, R10
	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 3]++
	SHRQ $8, R10
	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 4]++
	SHRQ $8, R10
	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 5]++
	SHRQ $8, R10
	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 6]++
	SHRQ $8, R10
	INCL (DI)(R10*4) // h[byte 7]++: after 7 shifts only the top byte remains
	ADDQ $8, SI
	DECQ R8
	JNZ loop_hist8

hist1:
	// Scalar tail: len(b)&7 remaining bytes, one at a time.
	ANDQ $7, R9
	JZ end_hist
	XORQ R10, R10 // upper bits of R10 stay zero across the MOVB loads below

loop_hist1:
	MOVB (SI), R10
	INCL (DI)(R10*4)
	INCQ SI
	DECQ R9
	JNZ loop_hist1

end_hist:
	RET
  176. RET