crc32_s390x.s 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build s390x
  5. #include "textflag.h"
  6. // Vector register range containing CRC-32 constants
      //
      // vectorizedBody fills V9..V14 with a single
      // "VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY", so each name below
      // corresponds to one 16-byte slot of the constant pool that R5 points
      // at, in this order (labels are the ones used in the DATA tables):
      //   CONST_PERM_LE2BE  pool+0   LE-to-BE permute mask
      //   CONST_R2R1        pool+16  R2, R1
      //   CONST_R4R3        pool+32  R4, R3
      //   CONST_R5          pool+48  0,  R5
      //   CONST_RU_POLY     pool+64  0,  u'
      //   CONST_CRC_POLY    pool+80  0,  P'(x) << 1
  7. #define CONST_PERM_LE2BE V9
  8. #define CONST_R2R1 V10
  9. #define CONST_R4R3 V11
  10. #define CONST_R5 V12
  11. #define CONST_RU_POLY V13
  12. #define CONST_CRC_POLY V14
  13. // The CRC-32 constant block contains reduction constants to fold and
  14. // process particular chunks of the input data stream in parallel.
  15. //
  16. // Note that the constant definitions below are extended in order to compute
  17. // intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
  18. // The rightmost doubleword can be 0 to prevent contribution to the result or
  19. // can be multiplied by 1 to perform an XOR without the need for a separate
  20. // VECTOR EXCLUSIVE OR instruction.
  21. //
  22. // The polynomials used are bit-reflected:
  23. //
  24. // IEEE: P'(x) = 0x0edb88320
  25. // Castagnoli: P'(x) = 0x082f63b78
  26. // IEEE polynomial constants
      //
      // Constant pool used by vectorizedIEEE. It holds six 16-byte entries
      // (96 bytes), each written as two 8-byte DATA words. vectorizedBody
      // loads the entries into V9..V14 (CONST_PERM_LE2BE..CONST_CRC_POLY)
      // with one VLM, so the entry order here must match the register order.
  27. DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask (byte indexes 15..8)
  28. DATA ·crcleconskp+8(SB)/8, $0x0706050403020100 // LE-to-BE mask (byte indexes 7..0)
  29. DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2
  30. DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1
  31. DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4
  32. DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3
  33. DATA ·crcleconskp+48(SB)/8, $0x0000000000000000 // zero half of the R5 entry
  34. DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5
  35. DATA ·crcleconskp+64(SB)/8, $0x0000000000000000 // zero half of the u' entry
  36. DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'
  37. DATA ·crcleconskp+80(SB)/8, $0x0000000000000000 // zero half of the polynomial entry
  38. DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1
      // NOTE(review): the symbol is declared as 144 bytes, but only offsets
      // 0..95 are initialized above and only 96 bytes (6 vector registers) are
      // loaded by vectorizedBody. Confirm whether the extra 48 bytes are
      // intentional.
  39. GLOBL ·crcleconskp(SB), RODATA, $144
  40. // Castagnoli polynomial constants
      //
      // Constant pool used by vectorizedCastagnoli. It has the same layout as
      // the IEEE pool: six 16-byte entries (96 bytes), each written as two
      // 8-byte DATA words, loaded into V9..V14
      // (CONST_PERM_LE2BE..CONST_CRC_POLY) by the VLM in vectorizedBody.
  41. DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask (byte indexes 15..8)
  42. DATA ·crccleconskp+8(SB)/8, $0x0706050403020100 // LE-to-BE mask (byte indexes 7..0)
  43. DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2
  44. DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1
  45. DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4
  46. DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3
  47. DATA ·crccleconskp+48(SB)/8, $0x0000000000000000 // zero half of the R5 entry
  48. DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5
  49. DATA ·crccleconskp+64(SB)/8, $0x0000000000000000 // zero half of the u' entry
  50. DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'
  51. DATA ·crccleconskp+80(SB)/8, $0x0000000000000000 // zero half of the polynomial entry
  52. DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1
      // NOTE(review): declared size is 144 bytes while only offsets 0..95 are
      // initialized and loaded. Confirm whether the padding is intentional.
  53. GLOBL ·crccleconskp(SB), RODATA, $144
  54. // func hasVectorFacility() bool
      //
      // Returns true only when both checks below pass:
      //   1. the facility list stored by STFLE has the 0x40 bit set in the
      //      first byte of its third doubleword, and
      //   2. a byte inserted into V16 reads back unchanged.
      // Otherwise it returns false. The 24-byte local frame is used as the
      // STFLE result buffer (three doublewords).
  55. TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
  56. MOVD $x-24(SP), R1 // R1 = address of the 24-byte frame buffer
  57. XC $24, 0(R1), 0(R1) // clear the storage
  58. MOVD $2, R0 // R0 is the number of double words stored -1
  59. WORD $0xB2B01000 // STFLE 0(R1), hand-encoded as a raw instruction word
  60. XOR R0, R0 // reset the value of R0 (presumably Go code expects R0 == 0 on s390x - confirm)
  61. MOVBZ z-8(SP), R1 // first byte of the third doubleword of the facility list
  62. AND $0x40, R1 // keep only the second-highest bit (facility bit 129, vector facility)
  63. BEQ novector // bit clear: report no vector support
  64. vectorinstalled:
  65. // check if the vector instruction has been enabled
      // NOTE(review): this label is never branched to; it only marks the
      // fall-through path.
  66. VLEIB $0, $0xF, V16 // insert 0xF into byte element 0 of V16
  67. VLGVB $0, V16, R1 // read byte element 0 of V16 back into R1
  68. CMPBNE R1, $0xF, novector // value did not survive the round trip
  69. MOVB $1, ret+0(FP) // have vx
  70. RET
  71. novector:
  72. MOVB $0, ret+0(FP) // no vx
  73. RET
  74. // The CRC-32 function(s) use these calling conventions:
  75. //
  76. // Parameters:
  77. //
  78. // R2: Initial CRC value, typically ~0; and final CRC (return) value.
  79. // R3: Input buffer pointer, performance might be improved if the
  80. // buffer is on a doubleword boundary.
  81. // R4: Length of the buffer, must be 64 bytes or greater.
  82. //
  83. // Register usage:
  84. //
  85. // R5: CRC-32 constant pool base pointer.
  86. // V0: Initial CRC value and intermediate constants and results.
  87. // V1..V4: Data for CRC computation.
  88. // V5..V8: Next data chunks that are fetched from the input buffer.
  89. //
  90. // V9..V14: CRC-32 constants.
  91. // func vectorizedIEEE(crc uint32, p []byte) uint32
      //
      // Entry point for the IEEE polynomial. Loads the arguments into the
      // registers documented in the calling-convention comment (R2 = crc,
      // R3 = buffer pointer, R4 = length), points R5 at the IEEE constant pool
      // and branches to vectorizedBody<>. vectorizedBody crashes on purpose
      // when the length is below 64 bytes.
      //
      // The transfer is a branch (BR), not a call, and both functions declare
      // a $0 frame, so the "ret + 32(FP)" store and the RET in
      // vectorizedBody<> presumably complete this function on its behalf.
  92. TEXT ·vectorizedIEEE(SB), NOSPLIT, $0
  93. MOVWZ crc+0(FP), R2 // R2 stores the CRC value
  94. MOVD p+8(FP), R3 // data pointer
  95. MOVD p_len+16(FP), R4 // len(p)
  96. MOVD $·crcleconskp(SB), R5 // R5: IEEE constant pool base pointer
  97. BR vectorizedBody<>(SB) // tail-branch into the shared implementation
  98. // func vectorizedCastagnoli(crc uint32, p []byte) uint32
      //
      // Entry point for the Castagnoli polynomial. Identical to vectorizedIEEE
      // except that R5 is pointed at the Castagnoli constant pool before
      // branching to vectorizedBody<>. vectorizedBody crashes on purpose when
      // the length is below 64 bytes.
      //
      // The transfer is a branch (BR), not a call, and both functions declare
      // a $0 frame, so the "ret + 32(FP)" store and the RET in
      // vectorizedBody<> presumably complete this function on its behalf.
  99. TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0
  100. MOVWZ crc+0(FP), R2 // R2 stores the CRC value
  101. MOVD p+8(FP), R3 // data pointer
  102. MOVD p_len+16(FP), R4 // len(p)
  103. // R5: crc-32 constant pool base pointer, constant is used to reduce crc
  104. MOVD $·crccleconskp(SB), R5
  105. BR vectorizedBody<>(SB) // tail-branch into the shared implementation
  106. TEXT vectorizedBody<>(SB), NOSPLIT, $0
       // Shared CRC-32 implementation reached by BR from vectorizedIEEE and
       // vectorizedCastagnoli.
       //
       // In:  R2 = initial CRC (inverted on entry)
       //      R3 = input buffer pointer
       //      R4 = input length in bytes; below 64 the routine jumps to
       //           "crash"
       //      R5 = base of the 96-byte constant pool to load into V9..V14
       // Out: the inverted result is stored to ret + 32(FP), then RET.
       //
       // Structure: fold 64 bytes per iteration while at least 64 bytes
       // remain, merge the four accumulators into one, fold 16 bytes per
       // iteration while at least 16 bytes remain, then reduce to 32 bits.
       //
       // NOTE(review): once fewer than 16 bytes remain they are never read
       // (final_fold does not touch R3/R4). Presumably callers pass a length
       // that is a multiple of 16 or handle the tail themselves - confirm.
  107. XOR $0xffffffff, R2 // NOTW R2
  108. VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY // load all six constants (V9..V14)
  109. // Load the initial CRC value into the rightmost word of V0
  110. VZERO V0
  111. VLVGF $3, R2, V0
  112. // Crash if the input size is less than 64-bytes.
  113. CMP R4, $64
  114. BLT crash
  115. // Load a 64-byte data chunk and XOR with CRC
  116. VLM 0(R3), V1, V4 // 64-bytes into V1..V4
  117. // Reflect the data if the CRC operation is in the bit-reflected domain
  118. VPERM V1, V1, CONST_PERM_LE2BE, V1
  119. VPERM V2, V2, CONST_PERM_LE2BE, V2
  120. VPERM V3, V3, CONST_PERM_LE2BE, V3
  121. VPERM V4, V4, CONST_PERM_LE2BE, V4
  122. VX V0, V1, V1 // V1 ^= CRC
  123. ADD $64, R3 // BUF = BUF + 64
  124. ADD $(-64), R4 // LEN = LEN - 64
  125. // Check remaining buffer size and jump to proper folding method
  126. CMP R4, $64
  127. BLT less_than_64bytes
  128. fold_64bytes_loop:
  129. // Load the next 64-byte data chunk into V5 to V8
  130. VLM 0(R3), V5, V8
  131. VPERM V5, V5, CONST_PERM_LE2BE, V5
  132. VPERM V6, V6, CONST_PERM_LE2BE, V6
  133. VPERM V7, V7, CONST_PERM_LE2BE, V7
  134. VPERM V8, V8, CONST_PERM_LE2BE, V8
  135. // Perform a GF(2) multiplication of the doublewords in V1 with
  136. // the reduction constants in CONST_R2R1. The intermediate result is
  137. // then folded (accumulated) with the next data chunk in V5 and
  138. // stored in V1. Repeat this step for the register contents
  139. // in V2, V3, and V4 respectively.
  140. VGFMAG CONST_R2R1, V1, V5, V1
  141. VGFMAG CONST_R2R1, V2, V6, V2
  142. VGFMAG CONST_R2R1, V3, V7, V3
  143. VGFMAG CONST_R2R1, V4, V8, V4
  144. // Adjust buffer pointer and length for next loop
  145. ADD $64, R3 // BUF = BUF + 64
  146. ADD $(-64), R4 // LEN = LEN - 64
  147. CMP R4, $64
  148. BGE fold_64bytes_loop
  149. less_than_64bytes:
  150. // Fold V1 to V4 into a single 128-bit value in V1
  151. VGFMAG CONST_R4R3, V1, V2, V1
  152. VGFMAG CONST_R4R3, V1, V3, V1
  153. VGFMAG CONST_R4R3, V1, V4, V1
  154. // Check whether to continue with 16-byte folding
  155. CMP R4, $16
  156. BLT final_fold
  157. fold_16bytes_loop:
  158. VL 0(R3), V2 // Load next data chunk
  159. VPERM V2, V2, CONST_PERM_LE2BE, V2
  160. VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk
  161. // Adjust buffer pointer and size for folding next data chunk
  162. ADD $16, R3
  163. ADD $-16, R4
  164. // Process remaining data chunks
  165. CMP R4, $16
  166. BGE fold_16bytes_loop
  167. final_fold:
       // From here on V9 (CONST_PERM_LE2BE) is overwritten and used as the
       // shift-count operand of VSRLB; no VPERM follows, so the permute mask
       // is no longer needed.
  168. VLEIB $7, $0x40, V9 // byte element 7 = 0x40: twice the 0x20 "word" shift below, i.e. 8 bytes
  169. VSRLB V9, CONST_R4R3, V0 // V0 = CONST_R4R3 shifted right by that amount
  170. VLEIG $0, $1, V0 // set doubleword element 0 of V0 to 1
  171. VGFMG V0, V1, V1 // V1 = GF(2) product of V1 with V0
  172. VLEIB $7, $0x20, V9 // Shift by words
  173. VSRLB V9, V1, V2 // Store remaining bits in V2
  174. VUPLLF V1, V1 // Split rightmost doubleword
  175. VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2
  176. // The input values to the Barrett reduction are the degree-63 polynomial
  177. // in V1 (R(x)), degree-32 generator polynomial, and the reduction
  178. // constant u. The Barrett reduction result is the CRC value of R(x) mod
  179. // P(x).
  180. //
  181. // The Barrett reduction algorithm is defined as:
  182. //
  183. // 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
  184. // 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
  185. // 3. C(x) = R(x) XOR T2(x) mod x^32
  186. //
  187. // Note: To compensate the division by x^32, use the vector unpack
  188. // instruction to move the leftmost word into the leftmost doubleword
  189. // of the vector register. The rightmost doubleword is multiplied
  190. // with zero to not contribute to the intermediate results.
  191. // T1(x) = floor( R(x) / x^32 ) GF2MUL u
  192. VUPLLF V1, V2
  193. VGFMG CONST_RU_POLY, V2, V2
  194. // Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with
  195. // T1(x) in V2 and XOR the intermediate result, T2(x), with the value in V1.
  196. // The final result is in the rightmost word of V2.
       // NOTE(review): the extraction at "done" reads word element 2 of V2,
       // whereas the initial CRC was inserted at word element 3 ("rightmost
       // word"). Confirm which wording is intended here.
  197. VUPLLF V2, V2
  198. VGFMAG CONST_CRC_POLY, V2, V1, V2
  199. done:
  200. VLGVF $2, V2, R2 // R2 = word element 2 of V2
  201. XOR $0xffffffff, R2 // NOTW R2
  202. MOVWZ R2, ret + 32(FP) // result slot after crc (+0) and the p slice (+8; p_len at +16)
  203. RET
  204. crash:
       // Deliberate fault: stores through the address held in R0 (presumably
       // zero, i.e. a nil-address write - confirm). There is no RET after it.
  205. MOVD $0, (R0) // input size is less than 64-bytes