diff --git a/specification/app_a.xml b/specification/app_a.xml index 504eb41..e5c4be6 100644 --- a/specification/app_a.xml +++ b/specification/app_a.xml @@ -24,8 +24,8 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> that compilers generate a warning or error for out-of-range literals. Vectors may be constructed from scalar values with a vector - constructor. For example: (vector type){e1, e2, ..., e - n}. The values specified for each vector element can + constructor. For example: (vector type){e1, e2, ..., en}. + The values specified for each vector element can be either a compile-time constant or a runtime expression. Floating-point vector built-in operators are controlled by the rounding mode set for floating-point operations unless otherwise @@ -309,6 +309,46 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> vector float vec_abs (vector float); + + + VEC_ABSD (ARG1, ARG2) + + + Purpose: + Computes the absolute difference. + Result value: + Each element of the result contains the absolute difference + of the corresponding input elements using modulo + arithmetic. + + + + + + + + vector unsigned char vec_absd (vector unsigned char, vector + unsigned char); + + + + + + + + vector unsigned int vec_absd (vector unsigned int, vector + unsigned int); + + + + + + + + vector unsigned short vec_absd (vector unsigned short, + vector unsigned short); + + VEC_ABSS (ARG1) @@ -759,8 +799,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. This optional function is being phased in, and it might not be available on all implementations. Phased-in interfaces are optional @@ -774,8 +813,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -785,8 +823,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -907,8 +944,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -918,8 +954,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. 
@@ -929,8 +964,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -1104,6 +1138,15 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> + + + + + + vector unsigned char vec_bperm (vector unsigned char, + vector unsigned char); + + @@ -1902,6 +1945,76 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> float); + + + VEC_CMPNEZ (ARG1, ARG2) + + + Purpose: + Returns a vector containing the results of comparing each + set of corresponding elements of the given vectors for inequality + or for an element with a 0 value. + Result value: + For each element of the result, the value of each bit is 1 + if the corresponding elements of ARG1 and + ARG2 are not equal, or if the ARG1 element or the ARG2 + element is 0. Otherwise, the value of each bit is 0. + + + + + + + + vector bool char vec_cmpnez (vector signed char, vector + signed char); + + + + + + + + vector bool char vec_cmpnez (vector unsigned char, vector + unsigned char); + + + + + + + + vector bool int vec_cmpnez (vector signed int, vector + signed int); + + + + + + + + vector bool int vec_cmpnez (vector unsigned int, vector + unsigned int); + + + + + + + + vector bool short vec_cmpnez (vector signed short, vector + signed short); + + + + + + + + vector bool short vec_cmpnez (vector unsigned short, vector + unsigned short); + + VEC_CNTLZ (ARG1) @@ -1919,8 +2032,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -1929,8 +2041,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -1940,8 +2051,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -1950,8 +2060,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -1960,8 +2069,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -1971,8 +2079,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -1982,8 +2089,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. 
@@ -1992,8 +2098,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -2001,6 +2106,151 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> short); + + + VEC_CNTLZ_LSBB (ARG1) + + + Purpose: + Returns the number of leading byte elements (starting at + the lowest-numbered element) of a vector that have a + least-significant bit of 0. + Result value: + The number of leading byte elements (starting at the + lowest-numbered element) of a vector that have a + least-significant bit of 0. + + + + + + + + signed int vec_cntlz_lsbb (vector signed char); + + + + + + + + signed int vec_cntlz_lsbb (vector unsigned char); + + + + + VEC_CNTTZ (ARG1) + + + Purpose: + Returns a vector containing the number of least-significant + bits equal to 0 of each corresponding element of the given + vector. + Result value: + The value of each element of the result is set to the + number of trailing zeros of the corresponding element of + ARG1. + + + + + + + + vector signed char vec_cnttz (vector signed char); + + + + + + + + vector unsigned char vec_cnttz (vector unsigned + char); + + + + + + + + vector signed int vec_cnttz (vector signed int); + + + + + + + + vector unsigned int vec_cnttz (vector unsigned int); + + + + + + + + vector signed long long vec_cnttz (vector signed long + long); + + + + + + + + vector unsigned long long vec_cnttz (vector unsigned long + long); + + + + + + + + vector signed short vec_cnttz (vector signed short); + + + + + + + + vector unsigned short vec_cnttz (vector unsigned + short); + + + + + VEC_CNTTZ_LSBB (ARG1) + + + Purpose: + Returns the number of trailing byte elements (starting at + the highest-numbered element) of a vector that have a + least-significant bit of 0. + Result value: + The number of trailing byte elements (starting at the + highest-numbered element) of a vector that have a + least-significant bit of 0. 
+ + + + + + + + signed int vec_cnttz_lsbb (vector signed char); + + + + + + + + signed int vec_cnttz_lsbb (vector unsigned char); + + VEC_CPSGN(ARG1, ARG2) @@ -2018,8 +2268,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -2028,8 +2277,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -2631,54 +2879,487 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> float vec_extract (vector float, signed int); - + - VEC_FLOAT (ARG1) + + Phased in. + + + + _Float16 vec_extract (vector _Float16, signed int); + + + + + VEC_EXTRACT_EXP (ARG1) Purpose: - Converts a vector of integers to a vector of - single-precision floating-point numbers. + Extracts an exponent from a floating-point number. Result value: - Target elements are obtained by converting the respective - source elements to unsigned integers. + Each element of the returned integer vector is extracted + from the exponent field of the corresponding floating-point + vector element. + The extracted exponent of ARG1 is returned as a + right-justified unsigned integer containing a biased exponent, in + accordance with the exponent representation specified by IEEE + 754, without further processing. - + - + - vector float - vec_float (vector signed int); + vector unsigned long long vec_extract_exp (vector + double); - + - + - vector float - vec_float (vector unsigned int); + vector unsigned int vec_extract_exp (vector float); - + - VEC_FLOAT2 (ARG1, ARG2) + VEC_EXTRACT_FP32_FROM_SHORTH (ARG1) Purpose: - Converts - an input vectora vector of integers - to a vector of single-precision - numbers floating-point numbers. + Extracts four single-precision floating-point numbers from + the high elements of a vector of eight 16-bit elements, + interpreting each element as a 16-bit floating-point number in + IEEE format. 
Result value: - Target elements are obtained by converting the source - elements to single-precision numbers as follows: - - - Target elements 0 and 1 from source 0 - + The first four elements are interpreted as 16-bit + floating-point numbers in IEEE format, and extended to + single-precision format, returning a vector with four + single-precision IEEE numbers. + + + + + + + + vector float vec_extract_fp32_from_shorth (vector unsigned + short); + + + + + VEC_EXTRACT_FP32_FROM_SHORTL (ARG1) + + + Purpose: + Extracts four single-precision floating-point numbers from + the low elements of a vector of eight 16-bit elements, + interpreting each element as a 16-bit floating-point number in + IEEE format. + Result value: + The last four elements are interpreted as 16-bit + floating-point numbers in IEEE format, and extended to + single-precision format, returning a vector with four + single-precision IEEE numbers. + + + + + + + + vector float vec_extract_fp32_from_shortl (vector unsigned + short); + + + + + VEC_EXTRACT_SIG (ARG1) + + + Purpose: + Extracts a significand (mantissa) from a floating-point + number. + Result value: + Each element of the returned integer vector is extracted + from the significand (mantissa) field of the corresponding + floating-point vector element. + The significand is from the corresponding floating-point + number in accordance with the IEEE format. The returned result + includes the implicit leading digit. The value of that digit is + not encoded in the IEEE format, but is implied by the + exponent. + + + + + + + + vector unsigned long long vec_extract_sig (vector + double); + + + + + + + + vector unsigned int vec_extract_sig (vector float); + + + + + VEC_EXTRACT4B (ARG1, ARG2) + + + Purpose: + Extracts a word from a vector at a byte position. + Result value: + The first doubleword element of the result contains the + zero-extended extracted word from ARG1. The second doubleword is + set to 0. 
ARG2 specifies the least-significant byte number (0 - + 12) of the word to be extracted. + + + + + + + + vector unsigned long long vec_extract4b (vector unsigned + char, const int) + + + + + VEC_FIRST_MATCH_INDEX (ARG1, ARG2) + + + Purpose: + Performs a comparison of equality on each of the + corresponding elements of ARG1 and ARG2, and returns the first + position of equality. + Result value: + Returns the element index of the position of the first + character match. If no match, returns the number of characters as + an element count in the vector argument. + + + + + + + + unsigned int vec_first_match_index (vector signed char, + vector signed char); + + + + + + + + unsigned int vec_first_match_index (vector unsigned char, + vector unsigned char); + + + + + + + + unsigned int vec_first_match_index (vector signed int, + vector signed int); + + + + + + + + unsigned int vec_first_match_index (vector unsigned int, + vector unsigned int); + + + + + + + + unsigned int vec_first_match_index (vector signed short, + vector signed short); + + + + + + + + unsigned int vec_first_match_index (vector unsigned short, + vector unsigned short); + + + + + VEC_FIRST_MATCH_OR_EOS_ INDEX (ARG1, ARG2) + + + Purpose: + Performs a comparison of equality on each of the + corresponding elements of ARG1 and ARG2. Returns the first + position of equality, or the zero string terminator. + Result value: + Returns the element index of the position of either the + first character match or an end-of-string (EOS) terminator. If no + match or terminator, returns the number of characters as an + element count in the vector argument. 
+ + + + + + + + unsigned int vec_first_match_or_eos_index (vector signed + char, vector signed char); + + + + + + + + unsigned int vec_first_match_or_eos_index (vector unsigned + char, vector unsigned char); + + + + + + + + unsigned int vec_first_match_or_eos_index (vector signed + int, vector signed int); + + + + + + + + unsigned int vec_first_match_or_eos_index (vector unsigned + int, vector unsigned int); + + + + + + + + unsigned int vec_first_match_or_eos_index (vector signed + short, vector signed short); + + + + + + + + unsigned int vec_first_match_or_eos_index (vector unsigned + short, vector unsigned short); + + + + + VEC_FIRST_MISMATCH_INDEX(ARG1, ARG2) + + + Purpose: + Performs a comparison of inequality on each of the + corresponding elements of ARG1 and ARG2, and returns the first + position of inequality. + Result value: + Returns the element index of the position of the first + character mismatch. If no mismatch, returns the number of + characters as an element count in the vector argument. + + + + + + + + unsigned int vec_first_mismatch_index (vector signed char, + vector signed char); + + + + + + + + unsigned int vec_first_mismatch_index (vector unsigned + char, vector unsigned char); + + + + + + + + unsigned int vec_first_mismatch_index (vector signed int, + vector signed int); + + + + + + + + unsigned int vec_first_mismatch_index (vector unsigned int, + vector unsigned int); + + + + + + + + unsigned int vec_first_mismatch_index (vector signed short, + vector signed short); + + + + + + + + unsigned int vec_first_mismatch_index (vector unsigned + short, vector unsigned short); + + + + + VEC_FIRST_MISMATCH_OR_ EOS_INDEX (ARG1, ARG2) + + + Purpose: + Performs a comparison of inequality on each of the + corresponding elements of ARG1 and ARG2. Returns the first + position of inequality, or the zero string terminator. 
+ Result value: + Returns the element index of the position of either the + first character mismatch or an end-of-string (EOS) terminator. If + no mismatch or terminator, returns the number of characters as an + element count in the vector argument. + + + + + + + + unsigned int vec_first_mismatch_or_eos_index (vector signed + char, vector signed char); + + + + + + + + unsigned int vec_first_mismatch_or_eos_index (vector + unsigned char, vector unsigned char); + + + + + + + + unsigned int vec_first_mismatch_or_eos_index (vector signed + int, vector signed int); + + + + + + + + unsigned int vec_first_mismatch_or_eos_index (vector + unsigned int, vector unsigned int); + + + + + + + + unsigned int vec_first_mismatch_or_eos_index (vector signed + short, vector signed short); + + + + + + + + unsigned int vec_first_mismatch_or_eos_index (vector + unsigned short, vector unsigned short); + + + + + VEC_FLOAT (ARG1) + + + Purpose: + Converts a vector of integers to a vector of + single-precision floating-point numbers. + Result value: + Target elements are obtained by converting the respective + source elements to unsigned integers. + + + + + + + + vector float + vec_float (vector signed int); + + + + + + + + vector float + vec_float (vector unsigned int); + + + + + VEC_FLOAT2 (ARG1, ARG2) + + + Purpose: + Converts + an input vectora vector of integers + to a vector of single-precision + numbers floating-point numbers. + Result value: + Target elements are obtained by converting the source + elements to single-precision numbers as follows: + + + Target elements 0 and 1 from source 0 + Target elements 2 and 3 from source 1 @@ -2750,6 +3431,52 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> vector float vec_floate (vector double); + + + VEC_FLOATH (ARG2) + Phased in. + + + + Purpose: + Converts a vector to a vector of single-precision + floating-point numbers. 
+ Result value: + Target elements 0 through 3 are set to the converted values + of source elements 0 through 3, respectively. + + + + + + + + vector float vec_floath (vector _Float16); + + + + + VEC_FLOATL (ARG2) + Phased in. + + + + Purpose: + Converts a vector to a vector of single-precision + floating-point numbers. + Result value: + Target elements 0 through 3 are set to the converted values + of source elements 4 through 7, respectively. + + + + + + + + vector float vec_floatl (vector _Float16); + + VEC_FLOATO (ARG2) @@ -2878,75 +3605,172 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - + + + + vector signed int vec_insert (signed int, vector signed + int, signed int); + + + + + + + + vector unsigned int vec_insert (unsigned int, vector + unsigned int, signed int); + + + + + + + + vector signed long long vec_insert (signed long long, + vector signed long long, signed int); + + + + + + + + vector unsigned long long vec_insert (unsigned long long, + vector unsigned long long, signed int); + + + + + + + + vector signed short vec_insert (signed short, vector signed + short, + signed int); + + + + + + + + vector unsigned short vec_insert (unsigned short, vector + unsigned short, signed int); + + + + + + + + vector double vec_insert (double, vector double, signed + int); + + + + + + + + vector float vec_insert (float, vector float, signed + int); + + + + + Phased in. + + + + vector _Float16 vec_insert (_Float16, vector _Float16, + signed int); + + + + + VEC_INSERT_EXP (ARG1, ARG2) - vector signed int vec_insert (signed int, vector signed - int, signed int); + Purpose: + Inserts an exponent into a floating-point number. + Result value: + Each element of the returned floating-point vector is + generated by combining the exponent specified by the + corresponding element of ARG2 with the sign and significand of + the corresponding element of ARG1. 
+ The inserted exponent of ARG2 is treated as a + right-justified unsigned integer containing a biased exponent, in + accordance with the exponent representation specified by IEEE + 754. It is combined with the sign and significand of ARG1 without + further processing. - + - + - vector unsigned int vec_insert (unsigned int, vector - unsigned int, signed int); + vector double vec_insert_exp (vector double, vector + unsigned long long); - + - + - vector signed long long vec_insert (signed long long, - vector signed long long, signed int); + vector double vec_insert_exp (vector unsigned long long, + vector unsigned long long); - + - + - vector unsigned long long vec_insert (unsigned long long, - vector unsigned long long, signed int); + vector float vec_insert_exp (vector float, vector unsigned + int); - + - + - vector signed short vec_insert (signed short, vector signed - short,. - signed int); + vector float vec_insert_exp (vector unsigned int, vector + unsigned int); - + - + VEC_INSERT4B (ARG1, ARG2, ARG3) - vector unsigned short vec_insert (unsigned short, vector - unsigned short, signed int); + Purpose: + Inserts a word into a vector at a byte position. + Result value: + Let W be the first doubleword element of ARG1, truncated to + 32 bits. The result vector is formed by inserting W into ARG2 + at the byte position (0 - 12) specified by ARG3. - + - + - vector double vec_insert (double, vector double, signed - int); + vector unsigned char vec_insert4b (vector signed int, + vector unsigned char, const int); - + - + - vector float vec_insert (float, vector float, signed - int); + vector unsigned char vec_insert4b (vector unsigned int, + vector unsigned char, const int); @@ -3186,8 +4010,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3197,8 +4020,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3208,8 +4030,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. 
- + Phased in. @@ -3219,8 +4040,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3230,8 +4050,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3241,8 +4060,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3252,8 +4070,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3263,8 +4080,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3362,8 +4178,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3425,6 +4240,16 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> float); + + + Phased in. + + + + vector _Float16 vec_mergeh (vector _Float16, vector + _Float16); + + VEC_MERGEL (ARG1, ARG2) @@ -3497,8 +4322,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3517,8 +4341,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3580,6 +4403,16 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> float); + + + Phased in. + + + + vector _Float16 vec_mergel (vector _Float16, vector + _Float16); + + VEC_MERGEO (ARG1, ARG2) @@ -3596,8 +4429,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3607,8 +4439,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3618,8 +4449,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3629,8 +4459,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3640,8 +4469,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3651,8 +4479,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3662,8 +4489,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3673,8 +4499,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -3988,8 +4813,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. 
@@ -3999,8 +4823,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -4620,8 +5443,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -4631,8 +5453,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -4760,8 +5581,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -4771,8 +5591,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -5101,6 +5920,41 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> double); + + + Phased in. + + + + vector _Float16 vec_pack (vector float, vector + float); + + + + + VEC_PACK_TO_SHORT_FP32 (ARG1, ARG2) + + + Purpose: + Packs eight single-precision 32-bit floating-point numbers + into a vector of eight 16-bit floating-point numbers. + Result value: + The value is a vector consisting of eight 16-bit elements, + each representing a 16-bit floating-point number that was created + by converting the corresponding single-precision value to + half-precision. + + + + + + + + vector unsigned short vec_pack_to_short_fp32 (vector float, + vector float); + + + VEC_PACKPX (ARG1, ARG2) @@ -5246,8 +6100,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -5273,6 +6126,73 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> vector unsigned int); + + + VEC_PARITY_LSBB (ARG1) + + + Purpose: + Compute parity on the least-significant bit of each + byte. + Result value: + Returns a vector with each element containing the parity of + the low-order bit of each of the bytes in that element. 
+ + + + + + + + vector unsigned int vec_parity_lsbb (vector signed + int); + + + + + + + + vector unsigned int vec_parity_lsbb (vector unsigned + int); + + + + + + + + vector unsigned __int128 vec_parity_lsbb (vector + signed __int128); + + + + + + + + vector unsigned __int128 vec_parity_lsbb (vector + unsigned __int128); + + + + + + + + vector unsigned long long vec_parity_lsbb (vector signed + long long); + + + + + + + + vector unsigned long long vec_parity_lsbb (vector unsigned + long long); + + VEC_PERM (ARG1, ARG2, ARG3) @@ -5343,8 +6263,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -5363,8 +6282,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -5426,6 +6344,16 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> unsigned char); + + + Phased in. + + + + vector _Float16 vec_perm (vector _Float16, vector _Float16, + vector unsigned char); + + VEC_PERMXOR (ARG1, ARG2, ARG3) @@ -5443,8 +6371,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -5454,8 +6381,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -5465,8 +6391,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -5777,6 +6702,15 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> vector float vec_revb (vector float); + + + Phased in. + + + + vector _Float16 vec_revb (vector _Float16); + + VEC_REVE (ARG1) @@ -5906,6 +6840,15 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> vector float vec_reve (vector float); + + + Phased in. + + + + vector _Float16 vec_reve (vector _Float16); + + VEC_RINT (ARG1) @@ -6026,6 +6969,76 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> unsigned short); + + + VEC_RLMI (ARG1, ARG2, ARG3) + + + Purpose: + Rotates each element of a vector left and inserts each + element under a mask. + Result value: + The result is obtained by rotating each element of vector + ARG1 left and inserting it under mask into ARG2. 
ARG3 bits 11:15 + contain the mask beginning, bits 19:23 contain the mask end, and + bits 27:31 contain the shift count. + + + + + + + + vector unsigned int vec_rlmi (vector unsigned int, vector + unsigned int, vector unsigned int); + + + + + + + + vector unsigned long long vec_rlmi (vector unsigned long + long, vector unsigned long long, vector unsigned long + long); + + + + + VEC_RLNM (ARG1, ARG2, ARG3) + + + Purpose: + Rotates each element of a vector left; then intersects + (AND) it with a mask. + Result value: + Each element of vector ARG1 is rotated left; then + intersected (AND) with a mask specified by ARG3. + ARG3 contains the mask begin, mask end, and shift count for + each element. The shift count is in the low-order byte, the mask + end is in the next higher byte, and the mask begin is in the next + higher byte. + + + + + + + + vector unsigned int vec_rlnm (vector unsigned int, vector + unsigned int, vector unsigned int); + + + + + + + + vector unsigned long long vec_rlnm (vector unsigned long + long, vector unsigned long long, vector unsigned long + long); + + VEC_ROUND (ARG1) @@ -6039,20 +7052,17 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> corresponding element of ARG1, rounded to the nearest representable floating-point integer, using IEEE round-to-nearest rounding. - - - Note: This function might not follow the strict + + This function might not follow the strict operation definition of the resolution of a tie during a round if the -qstrict=nooperationprecision compiler option is specified. - - + - Phased in. - + Phased in. @@ -6282,8 +7292,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -6293,8 +7302,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -6304,8 +7312,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -6395,6 +7402,26 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> unsigned int); + + + Phased in. 
+ + + + vector _Float16 vec_sel (vector _Float16, vector _Float16, + vector bool short); + + + + + Phased in. + + + + vector _Float16 vec_sel (vector _Float16, vector _Float16, + vector unsigned short); + + VEC_SIGNED (ARG1) @@ -6669,8 +7696,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -6680,8 +7706,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -6691,8 +7716,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -6738,8 +7762,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -6767,10 +7790,13 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> Result value: The value of each element is set to the value of an input element of the concatenated vectors ARG1 and ARG2, with the word - offset to its right - 1 specified by ARG3, which should be in the + offset to its right specified by ARG3, which should be in the range 0 - 3. - 1. A shift left picks values from the right. + + + A shift left picks values from the right. + + @@ -6898,8 +7924,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -6909,8 +7934,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -6957,10 +7981,14 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> The result is the contents of ARG1, shifted left by the number of bytes specified by the most-significant nibble of the least-significant byte - 1 of ARG2. The bits that are shifted out are + of ARG2. The bits that are shifted out are replaced by zeros. - 1. That is, by little-endian bits 7- 5 or big-endian bits - 121 - 124. + + + That is, by little-endian bits 7 - 5 or big-endian bits + 121 - 124. 
+ + @@ -7125,22 +8153,53 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> vector unsigned char); - + + + + + + vector float vec_slo (vector float, vector signed + char); + + + + + + + + vector float vec_slo (vector float, vector unsigned + char); + + + - + VEC_SLV (ARG1, ARG2) - vector float vec_slo (vector float, vector signed - char); + Purpose: + Left-shifts a vector by a varying number of bits by + element. + Result value: + For each integer 0 ≤ i ≤ 14, let + Xi be the halfword formed by concatenating + elements i and i+1 of ARG1. Let + X15 be the halfword formed by concatenating + element 15 of ARG1 with a zero byte. + Let Si be the value in the three least-significant + bits of element i of ARG2. Then, element i of the result vector + contains the value formed from bits + Si through + Si + 7 of + Xi. - + - + - vector float vec_slo (vector float, vector unsigned - char); + vector unsigned char vec_slv (vector unsigned char, vector + unsigned char); @@ -7214,8 +8273,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -7278,8 +8336,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -7294,6 +8351,16 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> vector float vec_splat (vector float, const int); + + + Phased in. + + + + vector _Float16 vec_splat (vector _Float16, const + int); + + VEC_SPLAT_S8 (ARG1) @@ -7530,6 +8597,15 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> vector float vec_splats (float); + + + Phased in. + + + + vector _Float16 vec_splats (_Float16); + + VEC_SQRT (ARG1) @@ -7790,8 +8866,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -7801,8 +8876,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -7925,8 +8999,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -7936,8 +9009,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -7947,8 +9019,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. 
- + Phased in. @@ -7958,8 +9029,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -8039,6 +9109,38 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> char); + + + VEC_SRV (ARG1, ARG2) + + + Purpose: + Right-shifts a vector by a varying number of bits by + element. + Result value: + For each integer 1 ≤ + i ≤ + 15, let X + i be the halfword formed by concatenating + elements i-1 and i of ARG1. Let X + 0 be the halfword formed by concatenating a + zero byte with element 0 of ARG1. Let S + i be the value in the three least-significant + bits of element i of ARG2. Then element i of the result vector + contains the value formed from bits 8 - S + i through 15 - S + i. + + + + + + + + vector unsigned char vec_srv (vector unsigned char, vector + unsigned char); + + VEC_SUB (ARG1, ARG2) @@ -8482,6 +9584,40 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> signed int); + + + VEC_TEST_DATA_CLASS (ARG1, ARG2) + + + Purpose: + Determines the data class for each floating-point + element. + Result value: + Each element is set to all ones if the corresponding + element of ARG1 matches one of the possible data types selected + by ARG2. If not, each element is set to all zeros. ARG2 can + select one of the data types defined in + . + + + + + + + + vector bool int vec_test_data_class (vector float, const + int); + + + + + + + + vector bool long long vec_test_data_class (vector double, + const int); + + VEC_TRUNC (ARG1) @@ -8615,14 +9751,22 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. vector double vec_unpackh (vector float); + + + Phased in. + + + + vector float vec_unpackh (vector _Float16); + + VEC_UNPACKL (ARG1) @@ -8726,14 +9870,22 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. vector double vec_unpackl (vector float); + + + Phased in. 
+ + + + vector float vec_unpackl (vector _Float16); + + VEC_UNSIGNED (ARG1) @@ -8859,15 +10011,13 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> Result value: This function adds the displacement and the pointer R-value to obtain the address for the load operation. - - - Important Note: For languages that support built-in - methods for pointer dereferencing, such as the C/C++ pointer - dereference * and array access [] operators, use of the - native operators is encouraged and use of the vec_xl - intrinsic is discouraged. - - + + For languages that support built-in + methods for pointer dereferencing, such as the C/C++ pointer + dereference * and array access [ ] operators, use of the + native operators is encouraged and use of the vec_xl + intrinsic is discouraged. + @@ -8937,170 +10087,353 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - vector unsigned long long vec_xl (long long, unsigned long - long *); + vector unsigned long long vec_xl (long long, unsigned long + long *); + + + + + + + + vector signed short vec_xl (long long, signed short + *); + + + + + + + + vector unsigned short vec_xl (long long, unsigned short + *); + + + + + + + + vector double vec_xl (long long, double *); + + + + + + + + vector float vec_xl (long long, float *); + + + + + Phased in. + + + + vector _Float16 vec_xl (long long, _Float16 *); + + + + + VEC_XL_BE (ARG1, ARG2) + + + Purpose: + In little-endian environments, loads the elements of the + 16-byte vector ARG1 starting with the highest-numbered element at + the memory address specified by the displacement ARG1 and the + pointer ARG2. In big-endian environments, this operator performs + the same operation as VEC_XL. + Result value: + In little-endian mode, loads the elements of the vector in + sequential order, with the highest-numbered element loaded from + the lowest data address and the lowest-numbered element of the + vector at the highest address. All elements are loaded in + little-endian data format. 
+ This function adds the displacement and the pointer R-value + to obtain the address for the load operation. It does not + truncate the affected address to a multiple of 16 bytes. + + + + + + + + vector signed char vec_xl_be (long long, signed char + *); + + + + + + + + vector unsigned char vec_xl_be (long long, unsigned char + *); + + + + + + + + vector signed int vec_xl_be (long long, signed int + *); + + + + + + + + vector unsigned int vec_xl_be (long long, unsigned int + *); + + + + + + + + vector signed __int128 vec_xl_be (long long, signed + __int128 *); + + + + + + + + vector unsigned __int128 vec_xl_be (long long, unsigned + __int128 *); + + + + + + + + vector signed long long vec_xl_be (long long, signed long + long *); + + + + + + + + vector unsigned long long vec_xl_be (long long, unsigned + long long *); + + + + + + + + vector signed short vec_xl_be (long long, signed short + *); + + + + + + + + vector unsigned short vec_xl_be (long long, unsigned short + *); + + + + + + + + vector double vec_xl_be (long long, double *); + + + + + + + + vector float vec_xl_be (long long, float *); - + - + Phased in. + - vector signed short vec_xl (long long, signed short - *); + vector _Float16 vec_xl_be (long long, _Float16 *); - + - + VEC_XL_LEN (ARG1, ARG2) - vector unsigned short vec_xl (long long, unsigned short - *); + Purpose: + Loads a vector of a specified byte length. + Result value: + Loads the number of bytes specified by ARG2 from the + address specified in ARG1. Initializes elements in order from the + byte stream (as defined by the endianness of the operating + environment). Any bytes of elements that cannot be initialized + from the number of loaded bytes have a zero value. + At least 0 and at most 16 bytes will be loaded. The length + is specified by the least-significant byte of ARG2, as min (mod + (ARG2, 256), 16). 
The behavior is undefined if the length + argument is outside of the range 0 - 255, or if it is not a + multiple of the vector element size. - + - + - vector double vec_xl (long long, double *); + vector signed char vec_xl_len (signed char *, + size_t); - + - + - vector float vec_xl (long long, float *); + vector unsigned char vec_xl_len (unsigned char *, + size_t); - + - VEC_XL_BE (ARG1. ARG2) + - Purpose: - In little-endian environments, loads the elements of the - 16-byte vector ARG1 starting with the highest-numbered element at - the memory address specified by the displacement ARG1 and the - pointer ARG2. In big-endian environments, this operator performs - the same operation as VEC_XL. - Result value: - In little-endian mode, loads the elements of the vector in - sequential order, with the highest-numbered element loaded from - the lowest data address and the lowest-numbered element of the - vector at the highest address. All elements are loaded in - little-endian data format. - This function adds the displacement and the pointer R-value - to obtain the address for the load operation. It does not - truncate the affected address to a multiple of 16 bytes. 
+ vector signed int vec_xl_len (signed int *, size_t); - + - + - vector signed char vec_xl_be (long long, signed char - *); + vector unsigned int vec_xl_len (unsigned int *, + size_t); - + - + - vector unsigned char vec_xl_be (long long, unsigned char - *); + vector signed __int128 vec_xl_len (signed __int128 *, + size_t); - + - + - vector signed int vec_xl_be (long long, signed int - *); + vector unsigned __int128 vec_xl_len (unsigned __int128 *, + size_t); - + - + - vector unsigned int vec_xl_be (long long, unsigned int - *); + vector signed long long vec_xl_len (signed long long *, + size_t); - + - + - vector signed __int128 vec_xl_be (long long, signed - __int128 *); + vector unsigned long long vec_xl_len (unsigned long long *, + size_t); - + - + - vector unsigned __int128 vec_xl_be (long long, unsigned - __int128 *); + vector signed short vec_xl_len (signed short *, + size_t); - + - + - vector signed long long vec_xl_be (long long, signed long - long *); + vector unsigned short vec_xl_len (unsigned short *, + size_t); - + - + - vector unsigned long long vec_xl_be (long long, unsigned - long long *); + vector double vec_xl_len (double *, size_t); - + - + - vector signed short vec_xl_be (long long, signed short - *); + vector float vec_xl_len (float *, size_t); - + - + - vector unsigned short vec_xl_be (long long, unsigned short - *); + vector _Float16 vec_xl_len (_Float16 *, size_t); - + - + VEC_XL_LEN_R (ARG1, ARG2) - vector double vec_xl_be (long long, double *); + Purpose: + Loads a vector of a specified byte length, + right-justified. + Result value: + Loads the number of bytes specified by ARG2 from the + address specified in ARG1, right justified with the first byte to + the left and the last to the right. Initializes elements in order + from the byte stream (as defined by the endianness of the + operating environment). Any bytes of elements that cannot be + initialized from the number of loaded bytes have a zero + value.
+ At least 0 and at most 16 bytes will be loaded. The length + is specified by the least-significant byte of ARG2, as min (mod + (ARG2, 256), 16). The behavior is undefined if the length + argument is outside of the range 0 - 255, or if it is not a + multiple of the vector element size. - + - + - vector float vec_xl_be (long long, float *); + vector unsigned char vec_xl_len_r (unsigned char *, + size_t); @@ -9179,8 +10512,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -9190,8 +10522,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -9254,15 +10585,13 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> address provided. Result value: Stores the provided vector in memory. - - - Important Note: For languages that support built-in - methods for pointer dereferencing, such as the C/C++ pointer - dereference * and array access [] operators, use of the - native operators is encouraged and use of the vec_xl - intrinsic is discouraged. - - + + For languages that support built-in + methods for pointer dereferencing, such as the C/C++ pointer + dereference * and array access [ ] operators, use of the + native operators is encouraged and use of the vec_xst + intrinsic is discouraged. + @@ -9371,6 +10700,16 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> void vec_xst (vector float, long long, float *); + + + Phased in. + + + + void vec_xst (vector _Float16, long long, _Float16 + *); + + VEC_XST_BE (ARG1, ARG2, ARG3) @@ -9500,6 +10839,180 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> void vec_xst_be (vector float, long long, float *); + + + Phased in. + + + + void vec_xst_be (vector _Float16, long long, _Float16 + *); + + + + + VEC_XST_LEN (ARG1, ARG2, ARG3) + + + Purpose: + Stores a vector of a specified byte length. + Result value: + Stores the number of bytes specified by ARG3 of the vector + ARG1 to the address specified in ARG2.
The bytes are obtained + starting from the lowest-numbered byte of the lowest-numbered + element (as defined by the endianness of the operating + environment). All bytes of an element are accessed before + proceeding to the next higher element. + At least 0 and at most 16 bytes will be stored. The length + is specified by the least-significant byte of ARG3, as min (mod + (ARG3, 256), 16). The behavior is undefined if the length + argument is outside of the range 0 - 255, or if it is not a + multiple of the vector element size. + + + + + + + + void vec_xst_len (vector signed char, signed char *, + size_t); + + + + + + + + void vec_xst_len (vector unsigned char, unsigned char *, + size_t); + + + + + + + + void vec_xst_len (vector signed int, signed int *, + size_t); + + + + + + + + void vec_xst_len (vector unsigned int, unsigned int *, + size_t); + + + + + + + + void vec_xst_len (vector signed __int128, signed __int128 + *, size_t); + + + + + + + + void vec_xst_len (vector unsigned __int128, unsigned + __int128 *, size_t); + + + + + + + + void vec_xst_len (vector signed long long, signed long long + *, size_t); + + + + + + + + void vec_xst_len (vector unsigned long long, unsigned long + long *, size_t); + + + + + + + + void vec_xst_len (vector signed short, signed short *, + size_t); + + + + + + + + void vec_xst_len (vector unsigned short, unsigned short *, + size_t); + + + + + + + + void vec_xst_len (vector double, double *, size_t); + + + + + + + + void vec_xst_len (vector float, float *, size_t); + + + + + + + + void vec_xst_len (vector _Float16, _Float16 *, + size_t); + + + + + VEC_XST_LEN_R (ARG1, ARG2, ARG3) + + + Purpose: + Stores a right-justified vector of a specified byte + length. + Result value: + Stores the number of bytes specified by ARG3 of the + right-justified vector ARG1 to the address specified by + ARG2. + At least 0 and at most 16 bytes will be stored.
The length + is specified by the least-significant byte of ARG3, as min (mod + (ARG3, 256), 16). The behavior is undefined if the length + argument is outside of the range 0 - 255, or if it is not a + multiple of the vector element size. + + + + + + + + void vec_xst_len_r (vector unsigned char, unsigned char *, + size_t); + + @@ -10078,8 +11591,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - This optional function is being + Phased in.This optional function is being phased in, and it might not be available on all implementations. Phased-in interfaces are optional for the current generation of compliant systems. @@ -11363,9 +12875,9 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> be used to access vectors with big-endian element ordering regardless of the data layout of the execution environment. Alternatively, in a little-endian environment, big-endian element ordering may be established - by using the vec_reve() vector operator. In a little-endian environment, + by using the vec_reve( ) vector operator. In a little-endian environment, big-endian byte order within each element may be established by using the - vec_revb() vector operator. + vec_revb( ) vector operator.
Finite Field Arithmetic and Secure Hashing The vector operators listed in @@ -11420,8 +12932,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. This optional function is being phased in and it might not be available on all implementations. @@ -11433,8 +12944,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -11444,8 +12954,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -11455,8 +12964,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -11517,8 +13025,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -11528,8 +13035,7 @@ xml:id="dbdoclet.50655245_pgfId-1138128"> - Phased in. - + Phased in. @@ -11597,8 +13103,7 @@ vec_xst_be(result,0, &le_result); - Phased in. - This optional function is being + Phased in.This optional function is being phased in and it might not be available on all implementations @@ -11625,8 +13130,7 @@ vec_xst_be(result,0, &le_result); - Phased in. - + Phased in. @@ -11652,8 +13156,7 @@ vec_xst_be(result,0, &le_result); - Phased in. - + Phased in. @@ -11679,8 +13182,7 @@ vec_xst_be(result,0, &le_result); - Phased in. - + Phased in. @@ -11706,8 +13208,7 @@ vec_xst_be(result,0, &le_result); - Phased in. - + Phased in. @@ -11962,15 +13463,13 @@ vec_xst_be(result,0, &le_result); (subject to being supported by all targeted implementations of the Power SIMD environment), the use of type-generic built-in names is recommended. - - - Note: The type-specific vector built-in types are provided for + + The type-specific vector built-in types are provided for legacy code compatibility only. The functions are deprecated, and support may be discontinued in the future. It is recommended that programmers use the respective overloaded vector built-in functions in conjunction with the appropriate vector type. 
- - + Functions Provided for Compatibility diff --git a/specification/app_b.xml b/specification/app_b.xml index d503cdc..872e88e 100644 --- a/specification/app_b.xml +++ b/specification/app_b.xml @@ -18,9 +18,8 @@ xml:id="dbdoclet.50655245_pgfId-1450875" revisionflag="added"> BCD values are stored in memory as contiguous arrays of 1 - 16 bytes. - BCD built-in functions are valid only when - - march or - - qarch is set to target POWER8 processors or + BCD built-in functions are valid only when -march or + -qarch is set to target POWER8 processors or later. @@ -424,8 +423,7 @@ xml:id="dbdoclet.50655245_pgfId-1450875" revisionflag="added"> - Macro - Or static inline function. + MacroOr static inline function. @@ -529,8 +527,7 @@ xml:id="dbdoclet.50655245_pgfId-1450875" revisionflag="added"> bcd_xl(a,b) - (bcd)vec_xl_len_r(a,b) - Optionaly, __builtin_ldrmb (a,b) for previous + (bcd)vec_xl_len_r(a,b)Optionally, __builtin_ldrmb (a,b) for previous generations of XL compilers. @@ -540,8 +537,7 @@ xml:id="dbdoclet.50655245_pgfId-1450875" revisionflag="added"> bcd_xst(a,b) - (bcd)vec_xst_len_r(a,b) - Optionaly, __builti_strmb (a,b) for previous + (bcd)vec_xst_len_r(a,b)Optionally, __builtin_strmb (a,b) for previous generatoin f XL compilers. diff --git a/specification/ch_1.xml b/specification/ch_1.xml index a06ee9f..497c59e 100644 --- a/specification/ch_1.xml +++ b/specification/ch_1.xml @@ -27,7 +27,7 @@ ). OpenPOWER-compliant processors in the 64-bit Power Architecture can execute in either big-endian or little-endian mode. Executables and - executable-generated data (in general) that subscribes to either byte + executable-generated data (in general) that subscribe to either byte ordering is not portable to a system running in the other mode. @@ -155,8 +155,7 @@ - ELF Assembly User’s Guide, Fourth edition, IBM, 2000. - + ELF Assembly User’s Guide, Fourth edition, IBM, 2000.
diff --git a/specification/ch_2.xml b/specification/ch_2.xml index 9bd4824..965b474 100644 --- a/specification/ch_2.xml +++ b/specification/ch_2.xml @@ -1588,16 +1588,16 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> - 3 + - + 3 - 4 + @@ -2057,7 +2057,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> types, pointers to both aligned and unaligned data of each data type shall return the value corresponding to a data type starting at the specified address when accessed with either the pointer dereference - operator * or the array reference operator []. + operator * or the array reference operator [ ].
@@ -2378,7 +2378,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> - any (*) () + any (*) ( ) Function pointer @@ -2658,8 +2658,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> - vector unsigned long - + vector unsigned long The vector long types are deprecated due to their ambiguity between 32-bit and 64-bit environments. The use of the vector long long types is preferred. @@ -2681,8 +2680,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> - vector signed long - + vector signed long vector signed long long @@ -2700,8 +2698,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> - vector bool long - + vector bool long vector bool long long @@ -2807,8 +2804,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> Elements of Boolean vector data types must have a value corresponding to all bits set to either 0 or 1. The result of computations on Boolean vectors, where at least one element is not - well formed - + well formed An element is well formed if it has all bits set to 0 or all bits set to 1. , is undefined for all vector elements. @@ -3517,16 +3513,16 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> 6 - 15 + - + 5 - 5 + @@ -3637,7 +3633,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> - 2 + 1 @@ -3646,7 +3642,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> - 1 + 0 @@ -4075,7 +4071,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> Programs and compilers may freely use all registers except those reserved for system use. The system signal handlers are responsible for preserving the original values upon return to the original execution - path. Signals + path. Signals that can interrupt the original execution path are documented in the System V Interface Definition (SVID). The tables in @@ -4214,8 +4210,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> r2 - Nonvolatile - + Nonvolatile Register r2 is nonvolatile with respect to calls between functions in the same compilation unit. 
It is saved and restored by code inserted by the linker resolving a @@ -4278,8 +4273,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> - r14 - r31 - + r14 - r31 If a function needs a frame pointer, assigning r31 to the role of the frame pointer is recommended. @@ -4491,31 +4485,30 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> - - NoteErratum: - When executing an - mfocr instruction, the POWER8 processor does not - implement the behavior described in the "Fixed-Point Invalid Forms - and Undefined Conditions" section of - POWER8 Processor User's Manual for the Single-Chip - Module. Instead, it replicates the selected condition - register field within the byte that contains it rather than - initializing to 0 the bits corresponding to the nonselected bits of - the byte that contains it. When generating code to save two condition - register fields that are stored in the same byte, the compiler must - mask the value received from - mfocr to avoid corruption of the resulting - (partial) condition register word. - This erratum does not apply to the POWER9 processor. + + Erratum: + When executing an + mfocr instruction, the POWER8 processor does not + implement the behavior described in the "Fixed-Point Invalid Forms + and Undefined Conditions" section of + POWER8 Processor User's Manual for the Single-Chip + Module. Instead, it replicates the selected condition + register field within the byte that contains it rather than + initializing to 0 the bits corresponding to the nonselected bits of + the byte that contains it. When generating code to save two condition + register fields that are stored in the same byte, the compiler must + mask the value received from + mfocr to avoid corruption of the resulting + (partial) condition register word. + This erratum does not apply to the POWER9 processor. + - - For more information, see + For more information, see Power ISA, version 3.0 and "Fixed-Point Invalid Forms and Undefined Conditions" in POWER9 Processor User's Manual. 
- - In + In OpenPOWER-compliant processors, floating-point and vector functions are implemented using a unified vector-scalar model. As shown in and @@ -4528,310 +4521,26 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> with VMX instructions to refer to a 32-register subset of 128-bit wide registers. -
- Floating-Point Registers as Part of VSRs - - - - - - - - - - - VSR(0) - - - FPR[0] - - - - - - - - - - - VSR(1) - - - FPR1] - - - - - - - - - - - - - - - ... - - - ... - - - - - - - - - - - - VSR(30) - - - FPR[30] - - - - - - - - - - - VSR(31) - - - FP[31] - - - - - - - - - - - VSR(32) - - - - - - - - - - - VSR(33) - - - - - - - - - - - - - - - - - - ... - - - ... - - - - - - - VSR(62) - - - - - - - - - - - VSR(63) - - - - - - - - - - - - - - 0 - - - 63 - - - 127 - - - 255 - - - - - -
+
+ Floating-Point Registers as Part of VSRs + + + + + +
+ +
+ Vector Registers as Part of VSRs + + + + + +
- - Vector Registers as Part of VSRs - - - - - - - - - VSR(0) - - - - - - - - - - - VSR(1) - - - - - - - - - - - - - - - - ... - - - ... - - - - - - - - - VSR(30) - - - - - - - - - - - VSR(31) - - - - - - - - - - - VSR(32) - - - VR[0] - - - - - - VSR(33) - - - VR[1] - - - - - - - - - - - ... - - - ... - - - - - - - VSR(62) - - - VR[30] - - - - - - VSR(63) - - - VR[31] - - - - - - - - - 0 - - - 127 - - - - - -
The classic floating-point repertoire consists of 32 floating-point registers, each 64 bits wide, and an associated special-purpose register to provide floating-point status and control. @@ -5208,11 +4917,11 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> Limited-Access Conditions Standard library functions expressly defined to change the state of limited-access bits are not constrained by nonvolatile preservation - rules; for example, the fesetround() and feenableexcept() functions. + rules; for example, the fesetround( ) and feenableexcept( ) functions. All other standard library functions shall save the old value of these bits on entry, change the bits for their purpose, and restore the bits before returning. - Where a standard library function, such as qsort(), calls + Where a standard library function, such as qsort( ), calls functions provided by an application, the following rules shall be observed: @@ -5238,19 +4947,19 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> into a signal handler because a library or user function can temporarily modify the limited-access bits when the signal is taken. - When setjmp() returns from its first call (also known as direct + When setjmp( ) returns from its first call (also known as direct invocation), it does not change the limited access bits. The limited - access bits have the values they had on entry to the setjmp() + access bits have the values they had on entry to the setjmp( ) function. - When longjmp() is performed, it appears to be returning from a - call to setjmp(). In this instance, the limited access bits are not - restored to the values they had on entry to the setjmp() + When longjmp( ) is performed, it appears to be returning from a + call to setjmp( ). In this instance, the limited access bits are not + restored to the values they had on entry to the setjmp( ) function. 
- C library functions, such as _FPU_SETCW() defined in + C library functions, such as _FPU_SETCW( ) defined in <fpu_control.h>, may modify the limited-access bits of the FPSCR. Additional C99 functions that can modify the FPSCR are defined in <fenv.h>. - The vector vec_mtvscr() function may change the limited-access NJ + The vector vec_mtvscr( ) function may change the limited-access NJ bit. The unwinder does not modify limited-access bits. To avoid the overhead of saving and restoring the FPSCR on every call, it is only @@ -5312,7 +5021,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> In - the white areas indicate an + the white areas indicate an optional save area of the stack frame. For a description of the optional save areas described by this ABI, see . @@ -5452,6 +5161,15 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> If a function changes the value in any nonvolatile floating-point register fN, it shall first save the value in fN in the Floating-Point Register Save Area and restore the register upon function exit. + If full unwind information such as + DWARF is present, registers can be + saved in arbitrary locations in the stack + frame. If the system floating-point register save and restore + functions are to be used, the floating-point registers + shall be saved in a contiguous range. Floating-point register fN + is saved in the doubleword located 8 × (32 – N) bytes before the back-chain + word of the previous frame, as shown in + The Floating-Point Register Save Area is always doubleword aligned. The size of the Floating-Point Register Save Area depends upon the number of floating-point registers that must be saved. If no @@ -5464,15 +5182,14 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> general-purpose register rN, it shall first save the value in rN in the General-Purpose Register Save Area and restore the register upon function exit. 
- If full unwind information such as - DWARF is present, registers can be + If full unwind information such as DWARF is present, registers can be saved in arbitrary locations in the stack frame. If the system floating-point register save and restore functions are to be used, the - floading-point registers shall be saved in a contiguous range. + floating-point registers shall be saved in a contiguous range. Floating-point register fN is saved in the doubleword located 8 x (32-N) bytes before the back-chain word of the previous frame, as shown in - + . The General-Purpose Register Save Area is always doubleword aligned. The size of the General-Purpose Register Save Area depends upon the number of general registers that must be saved. If no @@ -5483,15 +5200,14 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> If a function changes the value in any nonvolatile vector register vN, it shall first save the value in vN in the Vector Register Save Area and restore the register upon function exit. - If full unwind information such as - DWARF is present, registers can be + If full unwind information such as DWARF is present, registers can be saved in arbitrary locations in the stack frame. If the system vector register save and restore functions are to be used, the vector registers shall be saved in a contiguous range. Vector register vN is saved in the doubleword located 16 x (32-N) bytes before the General-Purpose Register Save Areas plus alignment padding, as shown in - + . The Vector Register Save Area is always quadword aligned. If necessary to ensure suitable alignment of the vector save area, a padding doubleword may be introduced between the vector register and @@ -5599,8 +5315,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> When 128-bit integer types are passed by value, map each to two consecutive GPRs, two consecutive doublewords, or a GPR and a - doubleword. - + doubleword. 
In big-endian environments, the most-significant doubleword of the quadword (__int128) parameter is stored in the lower numbered GPR or parameter word. The least-significant doubleword @@ -5610,7 +5325,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> is stored in the lower numbered GPR or parameter word. The most-significant doubleword of the quadword (__int128) is stored in the higher numbered GPR or parameter word. - The required alignment of int128 data types is 16 bytes. + The required alignment of int128 data types is 16 bytes. Therefore, by-value parameters must be copied to a new location in the local variable area of the callee's stack frame before the address of the type can be provided (for example, using the @@ -5653,31 +5368,31 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> Map fixed-size aggregates and unions passed by value to as many doublewords of the Parameter Save Area as the value uses in - memory. Align aggregates and unions as follows: - - - Aggregates that contain qualified floating-point or vector - arguments are normally aligned at the alignment of their base type. - For more information about qualified arguments, see - . - - - Other aggregates are normally aligned in accordance with the - aggregate's defined alignment. - - - The alignment will never be larger than the stack frame - alignment (16 bytes). + memory. Align aggregates and unions as follows: + + + Aggregates that contain qualified floating-point or vector + arguments are normally aligned at the alignment of their base type. + For more information about qualified arguments, see + . + + + Other aggregates are normally aligned in accordance with the + aggregate's defined alignment. + + + The alignment will never be larger than the stack frame + alignment (16 bytes). + + + This might result in doublewords being skipped for alignment. 
+ When a doubleword in the Parameter Save Area (or its GPR copy) contains + at least a portion of a structure, that doubleword must contain all + other portions mapping to the same doubleword. (That is, a doubleword + can either be completely valid, or completely invalid, but not + partially valid and invalid, except in the last doubleword where + invalid padding may be present.) - - This might result in doublewords being skipped for alignment. - When a doubleword in the Parameter Save Area (or its GPR copy) contains - at least a portion of a structure, that doubleword must contain all - other portions mapping to the same doubleword. (That is, a doubleword - can either be completely valid, or completely invalid, but not - partially valid and invalid, except in the last doubleword where - invalid padding may be present.) - Pad an aggregate or union smaller than one doubleword in size, but having a non-zero size, @@ -5773,24 +5488,24 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> A member of a homogeneous aggregate of multiple like data types passed in up to eight floating-point registers + A homogeneous aggregate can consist of a variety of nested + constructs including structures, unions, and array members, which shall + be traversed to determine the types and number of members of the base + floating-point type. (A complex floating-point data type is treated as if + two separate scalar values of the base type were passed.) + Homogeneous floating-point aggregates can have up to four IBM + EXTENDED PRECISION members, four IEEE BINARY 128 EXTENDED precision + members, four _Decimal128 members, or eight members of other + floating-point types. (Unions are treated as their largest member. For + homogeneous unions, different union alternatives may have different + sizes, provided that all union members are homogeneous with respect to + each other.) They are passed in floating-point registers if parameters of + that type would be passed in floating-point registers. 
They are passed in + vector registers if parameters of that type would be passed in vector + registers. They are passed as if each member was specified as a separate + parameter. - A homogeneous aggregate can consist of a variety of nested - constructs including structures, unions, and array members, which shall - be traversed to determine the types and number of members of the base - floating-point type. (A complex floating-point data type is treated as if - two separate scalar values of the base type were passed.) - Homogeneous floating-point aggregates can have up to four IBM - EXTENDED PRECISION members, four IEEE BINARY 128 EXTENDED precision - members, four _Decimal128 members, or eight members of other - floating-point types. (Unions are treated as their largest member. For - homogeneous unions, different union alternatives may have different - sizes, provided that all union members are homogeneous with respect to - each other.) They are passed in floating-point registers if parameters of - that type would be passed in floating-point registers. They are passed in - vector registers if parameters of that type would be passed in vector - registers. They are passed as if each member was specified as a separate - parameter. A qualified vector argument corresponds to: @@ -5804,21 +5519,21 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> Any future type requiring 16-byte alignment (see ) or processed in vector registers + For the purpose of determining a qualified floating-point argument, + _Float128 shall be considered a vector data type. In addition, _Float128 + is like a vector data type for determining if multiple aggregate members + are like. + A homogeneous aggregate can consist of a variety of nested + constructs including structures, unions, and array members, which shall + be traversed to determine the types and number of members of the base + vector type. 
Homogeneous vector aggregates with up to eight members are + passed in up to eight vector registers as if each member was specified as + a separate parameter. (Unions are treated as their largest member. For + homogeneous unions, different union alternatives may have different + sizes, provided that all union members are homogeneous with respect to + each other.) - For the purpose of determining a qualified floating-point argument, - _Float128 shall be considered a vector data type. In addition, _Float128 - is like a vector data type for determining if multiple aggregate members - are like. - A homogeneous aggregate can consist of a variety of nested - constructs including structures, unions, and array members, which shall - be traversed to determine the types and number of members of the base - vector type. Homogeneous vector aggregates with up to eight members are - passed in up to eight vector registers as if each member was specified as - a separate parameter. (Unions are treated as their largest member. For - homogeneous unions, different union alternatives may have different - sizes, provided that all union members are homogeneous with respect to - each other.) Floating-point and vector aggregates that contain padding words and integer fields with a width of 0 should not be treated as @@ -5851,8 +5566,7 @@ xml:id="dbdoclet.50655240_pgfId-1156194"> size of the corresponding in-memory representation of the passed argument's type. The parameter size is always rounded up to the next multiple of a - doubleword. - + doubleword. Consequently, each parameter of a non-zero size is allocated to at least one doubleword. @@ -6373,8 +6087,7 @@ s6 - 72 (stored) architectures that pass some of the arguments in registers. The Power Architecture is one of the architectures that passes some of the arguments in registers. 
- - The parameter + The parameter list may be zero length and is only allocated when parameters are spilled, when a function has unnamed parameters, or when no prototype is provided. When the Parameter Save Area is allocated, the Parameter Save @@ -6387,11 +6100,10 @@ s6 - 72 (stored) registers as if the return value was the first named input argument to a function unless the return value is a nonhomogeneous aggregate larger than 2 doublewords or a homogeneous aggregate with more than eight - registers. - + registers. For a definition of homogeneous aggregates, see . - (Homogeneous aggregates are arrays, structs, or unions of a + (Homogeneous aggregates are arrays, structs, or unions of a homogeneous floating-point or vector type and of a known fixed size.) Therefore, IBM EXTENDED PRECISION functions are returned in f1:f2. Homogeneous floating-point or vector aggregate return values that @@ -6572,9 +6284,13 @@ lvx v1, 0, r12 and branch instructions that use registers. In both cases, absolute addressing is not required. - - - + + Second, when absolute addressing is required, the value can be + computed with a Global Offset Table (GOT), which holds the information + for address computation. Static and const references can be + accessed using a TOC pointer relative addressing model, while (shared) + extern references must be accessed using the GOT-indirect addressing + scheme. Both addressing schemes require a TOC pointer to be initialized. @@ -6892,23 +6608,20 @@ or r0, r0, r1 .
GPR Save and Restore Functions - Each _savegpr0_ - N routine saves the general registers from r - N- r31, inclusive. Each routine also saves the LR. + Each _savegpr0_N routine saves the general registers from + rN- r31, inclusive. Each routine also saves the LR. The stack frame must not have been allocated yet. When the routine is called, r1 contains the address of the word immediately beyond the end of the general register save area, and r0 must contain the value of the LR on function entry. - The _restgpr0_ - N routines restore the general registers from r - N- r31, and then return to their caller's caller. + The _restgpr0_N routines restore the general registers from + rN- r31, and then return to their caller's caller. The caller's stack frame must already have been deallocated. When the routine is called, r1 contains the address of the word immediately beyond the end of the general register save area, and the LR must contain the return address. - A sample implementation of _savegpr0_ - N and _restgpr0_ - N follows: + A sample implementation of _savegpr0_N and + _restgpr0_N follows: _savegpr0_14: std r14,-144(r1) _savegpr0_15: std r15,-136(r1) _savegpr0_16: std r16,-128(r1) @@ -7007,17 +6720,14 @@ or r0, r0, r1
FPR Save and Restore Functions - Each _savefpr_ - N routine saves the floating-point registers from f - N- f31, inclusive. When the routine is called, r1 + Each _savefpr_N routine saves the floating-point registers from + fN- f31, inclusive. When the routine is called, r1 contains the address of the word immediately beyond the end of the Floating-Point Register Save Area, which means that the stack frame must not have been allocated yet. Register r0 must contain the value of the LR on function entry. - The _restfpr_ - N routines restore the floating-point registers - from f - N- f31, inclusive. When the routine is called, r1 + The _restfpr_N routines restore the floating-point registers + from fN- f31, inclusive. When the routine is called, r1 contains the address of the word immediately beyond the end of the Floating-Point Register Save Area, which means that the stack frame must not have been allocated yet. @@ -7025,9 +6735,8 @@ or r0, r0, r1 same prologue, or _restfpr_M and _restgpr0_M in the same epilogue. It is correct to call _savegpr1_M and _savefpr_M in either order, and to call _restgpr1_M and then _restfpr_M. - A sample implementation of _savefpr_ - N and _restfpr_ - N follows: + A sample implementation of _savefpr_N and + _restfpr_N follows: _savefpr_14: stfd f14,-144(r1) _savefpr_15: stfd f15,-136(r1) _savefpr_16: stfd f16,-128(r1) @@ -7450,15 +7159,14 @@ stw r0,0,(r7) Due to fusion hardware support, the preferred code forms are - destructive - + destructive Destructive in this context refers to a code sequence where the first intermediate result computed by a first instruction is overwritten (that is, "destroyed") by the result of a second instruction so that only one result register is produced. Fusion can then give the same performance as a single load instruction with a 32-bit displacement. 
- addressing forms with an addis specifying a set of + addressing forms with an addis specifying a set of high-order bits followed immediately by a destructive load using the same target register as the addis instruction to load data from a signed 32-bit offset from a base register. @@ -7520,13 +7228,10 @@ stw r0,0,(r7) Low part of the offset: symbol@l - If the instruction using symbol@got@ - l has a signed immediate operand (for example, - addi), use symbol@got@ - ha(high adjusted) for the high part of the offset. + If the instruction using symbol@got@l has a signed immediate operand (for example, + addi), use symbol@got@ha(high adjusted) for the high part of the offset. If it has an unsigned immediate operand (for example, ori), use - symbol@got@ - h. For a description of high-adjusted values, see + symbol@got@h. For a description of high-adjusted values, see . @@ -7557,7 +7262,16 @@ stw r0,0,(r7) . - +
+ Direct Function Call + + + + + +
+ The called function is not in the same executable or shared @@ -7603,13 +7317,22 @@ nop For indirect function calls, the address of the function to be called is placed in r12 and the CTR register. A bctrl instruction is used - to pereform the indirect branch as shown in + to perform the indirect branch as shown in , and . The ELF V2 ABI requires the address of the called function to be in r12 when a cross-module function call is made. -
+
+ Indirect Function Call (Absolute Medium Model) + + + + + +
+ shows how to make an indirect function call using small-model position-independent code. -
+
+ Small-Model Position-Independent Indirect Function Call + + + + + +
+ shows how to make an indirect function call using large-model position-independent code. -
+
+ Large-Model Position-Independent Indirect Function Call + + + + + +
+ Function calls need to be performed in conjunction with establishing, maintaining, and restoring addressability through the TOC pointer register, r2. When a function is called, the TOC pointer register @@ -7828,7 +7569,16 @@ bl target shows the model for branch instructions. -
+
+ Branch Instruction Model + + + + + +
+ Selecting one of multiple branches is accomplished in C with switch statements. An address table is used by the compiler to implement the switch statement selections in cases where the case labels satisfy @@ -7886,8 +7636,18 @@ b .L01 application) loaded into the low or high address range, absolute addressing of a branch table yields the best performance. -
- Absolute Switch Code (Within)for static modules located in low + <figure> + <title>Absolute Switch Code (Within) for static modules located in low + or high 2 GB of address space + + + + + + + A faster variant of this code may be used to locate branch targets in the bottom 2 GB of the address space in conjunction with the lwz instruction in place of the lwa instruction. -
+ +
+ Absolute Switch Code (Beyond) for static modules beyond the top + or bottom 2 GB of the address space + + + + + +
+ For position-independent code targeted at being dynamically loaded to different address ranges as DSO, the preferred code pattern uses TOC-relative addressing by taking advantage of the fact that the TOC @@ -8014,7 +7785,18 @@ bctr relative offsets from the start address of the branch table ensures position-independence when code is loaded at different addresses. -
+ +
+ Position-Independent Switch Code for Small/Medium Models + (preferred with TOC-relative addressing) + + + + + +
+ For position-independent code targeted at being dynamically loaded to different address ranges as a DSO or a position-independent executable (PIE), the preferred code pattern uses TOC-indirect addresses for code @@ -8079,7 +7861,17 @@ bctr table ensures position independence when code is loaded at different addresses. -
+
+ Position-Independent Switch Code for All Models (alternate, with + GOT-indirect addressing) + + + + + +
+ shows how, in the medium code model, PIC code can be used to avoid using the lwa instruction, which may @@ -8194,7 +7986,7 @@ f1: Using a frame pointer is the recognized method for maintaining addressability to arguments or local variables. (This may be a pointer to the top of the stack frame, typically in r31.) For - correct behavior in the cases of setjmp() and longjmp(), the frame + correct behavior in the cases of setjmp( ) and longjmp( ), the frame pointer shall be allocated in a nonvolatile general-purpose register. @@ -8262,7 +8054,7 @@ addi r3,r1,p ; R3 = new data area following parameter save area.The DWARF specification is used by compilers and debuggers to aid source-level or symbolic debugging. However, the format is not biased toward any particular compiler or debugger. Per the DWARF specification, a - mapping from Power Archtecture regiters to register numbers is required as + mapping from Power Architecture registers to register numbers is required as described in .All instances of the Power Architecture use the mapping shown in for encoding registers into @@ -8397,8 +8189,7 @@ addi r3,r1,p ; R3 = new data area following parameter save area.cr0 - cr7 - 0.5 - + 0.5 The CRx registers correspond to 4-bit fields within a word where the offset of the 4-bit group within a word is a function of the CRFx number (x). diff --git a/specification/ch_3.xml b/specification/ch_3.xml index a7b0ac0..f173b04 100644 --- a/specification/ch_3.xml +++ b/specification/ch_3.xml @@ -116,8 +116,7 @@ e_ident[EI_DATA] ELFDATA2LSB For all little-endian implementations. - .plt - + .plt The type of the OpenPOWER ABI .plt section is SHT_NOBITS, not SHT_PROGBITS as on most other processors. @@ -184,7 +183,7 @@ e_ident[EI_DATA] ELFDATA2LSB For all little-endian implementations. and + and . @@ -319,7 +318,7 @@ e_ident[EI_DATA] ELFDATA2LSB For all little-endian implementations.
- Note: If the function is not a leaf function, it must + If the function is not a leaf function, it must call subroutines using the R_PPC64_REL24_NOTOC relocation to indicate that the TOC register is not initialized. In turn, this may lead to more expensive procedure linkage @@ -852,7 +851,7 @@ my_func: - + 0 @@ -2230,7 +2229,7 @@ my_func: Denotes the high adjusted value: bits 16 - 63 of the - indicated value, compensating for #lo() being treated as a + indicated value, compensating for #lo( ) being treated as a signed number. That is: #ha(x) = (x + 0x8000) >> 16 @@ -2324,7 +2323,7 @@ my_func: If n is the offset computed: GOT[n] = dtpmod GOT[n + 1] = dtprel - The call to __tls_get_addr () happens as: + The call to __tls_get_addr ( ) happens as: __tls_get_addr ((tls_index *) &GOT[n]) @@ -2339,7 +2338,7 @@ my_func: If n is the offset computed: GOT[n] = dtpmod GOT[n + 1] = 0 - The call to __tls_get_addr () happens as: + The call to __tls_get_addr ( ) happens as: __tls_get_addr ((tls_index *) &GOT[n]) @@ -2360,7 +2359,7 @@ my_func: - Note: Relocations flagged with an asterisk(*) will + Relocations flagged with an asterisk(*) will trigger a relocation failure if the value computed does not fit in the field specified. @@ -2426,11 +2425,12 @@ my_func: - Note:Relocation values 8, 9, 12, 13, 18, 23, 32, + + Relocation values 8, 9, 12, 13, 18, 23, 32, and 247 are not used. This is to maintain a correspondence to the relocation values used by the 32-bit PowerPC ELF ABI. - + @@ -3204,7 +3204,7 @@ my_func: half16ds* - + (R + A) >> 2 @@ -4119,9 +4119,7 @@ my_func: resolved through a call to the symbol’s procedure linkage table entry. Additionally, it instructs the link editor to build a procedure linkage table for the executable or shared object if one is not created. - - R_PPC64_COPY + R_PPC64_COPY This relocation type is created by the link editor for dynamic linking. Its offset member refers to a location in a writable segment.
The symbol table index specifies a symbol that should exist both in the @@ -4219,7 +4217,7 @@ ld r3,x@got@l(r3) - Note: If X is a variable stored in the TOC, + If X is a variable stored in the TOC, then X@got is the offset within the TOC of a doubleword whose value is X@toc. @@ -4274,23 +4272,23 @@ lwz rt, offset(r2) Compilers and programmers must ensure that r2 is live at the actual data access point associated with extended displacement addressing. - -
- TOC Pointer Usage - To enable linker-based optimizations when global data is accessed, - the TOC pointer needs to be available for dereference at the point of all - uses of values derived from the TOC pointer in conjunction with the @l - operator. This property is used by the linker to optimize TOC pointer - accesses. In addition, all reaching definitions for a TOC-pointer-derived - access must compute the same definition. - In some implementations, non-ABI-compliant code may be processed by - providing additional linker options; for example, linker options - disabling linker optimization. However, this behavior in support of - non-ABI-compliant code is not guaranteed to be portable and supported in - all systems. -   - Compliant example - addis r4, r2, mysym@toc@ha +
+ TOC Pointer Usage + To enable linker-based optimizations when global data is accessed, + the TOC pointer needs to be available for dereference at the point of all + uses of values derived from the TOC pointer in conjunction with the @l + operator. This property is used by the linker to optimize TOC pointer + accesses. In addition, all reaching definitions for a TOC-pointer-derived + access must compute the same definition. + In some implementations, non-ABI-compliant code may be processed by + providing additional linker options; for example, linker options + disabling linker optimization. However, this behavior in support of + non-ABI-compliant code is not guaranteed to be portable and supported in + all systems. +   + Compliant example + addis r4, r2, mysym@toc@ha b target @@ -4301,9 +4299,9 @@ lwz rt, offset(r2) target: addi r4, r4, mysym@toc@l ... -   - Non-compliant example - li r4, 0 ; #d1 +   + Non-compliant example + li r4, 0 ; #d1 b target ... @@ -4312,14 +4310,16 @@ target: target: addi r4, r4, mysym@toc@l ; incompatible definitions #d1 and #d2 reach this ... +
+
+ Table Jump Sequences + Some linkers may rewrite jump table sequences, as described in + . For example, linkers may + rewrite address references created using GOT-indirect loads and bl+4 + sequences to use TOC-relative address computation. +
-
- Table Jump Sequences - Some linkers may rewrite jump table sequences, as described in - . For example, linkers may - rewrite address references created using GOT-indirect loads and bl+4 - sequences to use TOC-relative address computation. -
+
Fusion Code generation in compilers, linkers, and by programmers should @@ -4426,14 +4426,14 @@ addi r4, r4, lower tlsoffset(m + 1) = round(tlsoffset(m) + tlssize(m), align(m + 1)) - The function round() returns its first argument rounded up to + The function round( ) returns its first argument rounded up to the next multiple of its second argument: round(x, y) = y × ceiling(x / y) - The function ceiling() returns the smallest integer greater + The function ceiling( ) returns the smallest integer greater than or equal to its argument, where n is an integer satisfying: n - 1 < x ≤ n: @@ -4441,9 +4441,9 @@ tlsoffset(m + 1) = round(tlsoffset(m) + tlssize(m), align(m + 1))ceiling(x) = n In the case of dynamic shared objects (DSO), TLS blocks are allocated on an as-needed basis, with the details of allocation - abstracted away by the __tls_get_addr() function, which is used to + abstracted away by the __tls_get_addr( ) function, which is used to retrieve the address of any TLS variable. - The prototype for the __tls_get_addr() function, is defined as + The prototype for the __tls_get_addr( ) function, is defined as follows. typedef struct { @@ -4514,7 +4514,7 @@ extern void *__tls_get_addr (tls_index *ti); code model, which is the default for the ELF V2 ABI. Given the following code fragment, to determine the address of a - thread-local variable x, the __tls_get_addr() function is called with one + thread-local variable x, the __tls_get_addr( ) function is called with one parameter. That parameter is a pointer to a data object of type tls_index. extern __thread unsigned int x; @@ -5717,8 +5717,7 @@ static __thread unsigned int x3; the following code, which makes no reference to GOT entries. The GOT entries in can be removed from the GOT by - the linker when performing this code transformation. - + the linker when performing this code transformation. 
To further optimize the code in , a linker may reschedule the sequence to exploit fusion by generating a sequence that may be fused @@ -6251,7 +6250,7 @@ nop - 1. The linker may prefer to schedule the addis and + The linker may prefer to schedule the addis and addi to be adjacent to take advantage of fusion as a microarchitecture optimization opportunity. @@ -6703,7 +6702,7 @@ nop The result of performing a relocation for a TLS symbol is the module ID and its offset within the TLS block. These are then stored in the GOT. Later, they are obtained by the dynamic linker at run-time and - passed to __tls_get_addr(), which returns the address for the variable + passed to __tls_get_addr( ), which returns the address for the variable for the current thread. For more information, see . For TLS relocations, see diff --git a/specification/ch_4.xml b/specification/ch_4.xml index e43571e..362658d 100644 --- a/specification/ch_4.xml +++ b/specification/ch_4.xml @@ -141,7 +141,7 @@ xmlns:xl="http://www.w3.org/1999/xlink" version="5.0" xml:lang="en">
- Note: For the PT_LOAD entry describing the data segment, the + For the PT_LOAD entry describing the data segment, the p_memsz may be greater than the p_filesz. The difference is the size of the .bss section. On implementations that use virtual memory file mapping, only the portion of the file between the .data p_offset @@ -152,7 +152,7 @@ xmlns:xl="http://www.w3.org/1999/xlink" version="5.0" xml:lang="en"> data through p_vaddr + p_memsz. - demonstrates a typical mapping of + demonstrates a typical mapping of file to memory segments. Memory Segment Mappings @@ -341,8 +341,8 @@ xmlns:xl="http://www.w3.org/1999/xlink" version="5.0" xml:lang="en"> argument passing. For example, a C program might typically issue the following declaration to begin executing at the local entry point of a function named main: - extern int main (int argc, char *argv[], char *envp[], void *auxv[]); -int main(int argc, char *argv[], char *envp[], ElfW(auxv_t) *auxvec) + extern int main (int argc, char *argv[ ], char *envp[ ], void *auxv[ ]); +int main(int argc, char *argv[ ], char *envp[ ], ElfW(auxv_t) *auxvec)where: @@ -366,192 +366,190 @@ int main(int argc, char *argv[], char *envp[], ElfW(auxv_t) *auxvec) This section explains how to implement the call to main or to the entry point. - -
- Registers - <anchor xml:id="dbdoclet.50655242_PROC-REG" - xreflabel="" /> Registers - The contents of most registers are - not specified when a process is first entered from an - exec system call. A program should not expect the operating system to set - all registers to 0. If a register other than those listed in - must have a specific value, the - program must set it to that value during process initialization. - The contents of the following registers - are specified: - -
- Registers Specified during Process Initialization - - - - - - - - Register - - - - - Description - - - - - - - - r1 - - - The initial stack pointer, aligned to a quadword - boundary. - - - - - r2 - - - Undefined. - - - - - r3 - - - Contains argc, the nonnegative argument count. - - - - - r4 - - - Contains argv, a pointer to the array of argument - pointers in the stack. The array is immediately followed by a - NULL pointer. If there are no arguments, r4 points to a NULL - pointer. - - - - - r5 - - - Contains envp, a pointer to the array of environment - pointers in the stack. The array is immediately followed by a - NULL pointer. If no environment exists, r5 points to a NULL - pointer. - - - - - r6 - - - Contains a pointer to the auxiliary vector. The auxiliary - vector shall have at least one member, a terminating entry with - an a_type of AT_NULL (see - ). - - - - - r7 - - - Contains a termination function pointer. If r7 contains a - nonzero value, the value represents a function pointer that the - application should register with atexit. If r7 contains zero, - no action is required. - - - - - r12 - - - Contains the address of the global entry point of the - first function being invoked, which represents the start - address of the executable specified in the exec call. - - - - - FPSCR - - - Contains 0, specifying “round to nearest” mode for both - binary and decimal rounding modes, IEEE Mode, and the disabling - of floating-point exceptions. - - - - - VSCR - - - Vector Status and Control Register. Contains 0, - specifying vector Java/IEEE mode and that no saturation has - occurred. - - - - -
- The run-time that gets control from _start is responsible for: - - - Creating the first stack frame - - - Initializing the first stack frame's back chain pointer to - NULL - - - Allocating and initializing TLS storage - - - Initializing the thread control block (TCB) and dynamic thread - vector (DTV) - - - Initializing any __thread variables - - - Setting R13 for the initial process thread. - - - This initialization must be completed before any library - initialization codes are run and before control is transferred to the - main program (main()). -
-
- Process Stack - Although every process has a stack, no fixed stack address is - defined by the system. In addition, a program's stack address can change - from one system to another. It can even change from one process - invocation to another. Thus, the process initialization code must use the - stack address in general-purpose register r1. Data in the stack segment - at addresses below the stack pointer contain undefined values. -
-
- Auxiliary Vector - The argument and environment vectors transmit information from one - application program to another. However, the auxiliary vector conveys - information from the operating system to the program. This vector is an - array of structures, defined as follows: - typedef struct + +
+ Registers + The contents of most registers are + not specified when a process is first entered from an + exec system call. A program should not expect the operating system to set + all registers to 0. If a register other than those listed in + must have a specific value, the + program must set it to that value during process initialization. + The contents of the following registers + are specified: + + + Registers Specified during Process Initialization + + + + + + + + Register + + + + + Description + + + + + + + + r1 + + + The initial stack pointer, aligned to a quadword + boundary. + + + + + r2 + + + Undefined. + + + + + r3 + + + Contains argc, the nonnegative argument count. + + + + + r4 + + + Contains argv, a pointer to the array of argument + pointers in the stack. The array is immediately followed by a + NULL pointer. If there are no arguments, r4 points to a NULL + pointer. + + + + + r5 + + + Contains envp, a pointer to the array of environment + pointers in the stack. The array is immediately followed by a + NULL pointer. If no environment exists, r5 points to a NULL + pointer. + + + + + r6 + + + Contains a pointer to the auxiliary vector. The auxiliary + vector shall have at least one member, a terminating entry with + an a_type of AT_NULL (see + ). + + + + + r7 + + + Contains a termination function pointer. If r7 contains a + nonzero value, the value represents a function pointer that the + application should register with atexit. If r7 contains zero, + no action is required. + + + + + r12 + + + Contains the address of the global entry point of the + first function being invoked, which represents the start + address of the executable specified in the exec call. + + + + + FPSCR + + + Contains 0, specifying “round to nearest” mode for both + binary and decimal rounding modes, IEEE Mode, and the disabling + of floating-point exceptions. + + + + + VSCR + + + Vector Status and Control Register.
Contains 0, + specifying vector Java/IEEE mode and that no saturation has + occurred. + + + + +
+ The run-time that gets control from _start is responsible for: + + + Creating the first stack frame + + + Initializing the first stack frame's back chain pointer to + NULL + + + Allocating and initializing TLS storage + + + Initializing the thread control block (TCB) and dynamic thread + vector (DTV) + + + Initializing any __thread variables + + + Setting R13 for the initial process thread. + + + This initialization must be completed before any library + initialization codes are run and before control is transferred to the + main program (main( )). +
+
+ Process Stack + Although every process has a stack, no fixed stack address is + defined by the system. In addition, a program's stack address can change + from one system to another. It can even change from one process + invocation to another. Thus, the process initialization code must use the + stack address in general-purpose register r1. Data in the stack segment + at addresses below the stack pointer contain undefined values. +
+
+ Auxiliary Vector + The argument and environment vectors transmit information from one + application program to another. However, the auxiliary vector conveys + information from the operating system to the program. This vector is an + array of structures, defined as follows: + typedef struct { long a_type; union { long a_val; void *a_ptr; - void (*a_fcn)(); + void (*a_fcn)( ); } a_un; } auxv_t; @@ -571,7 +569,7 @@ AT_EGID 14 /* Effective group ID (egid) */ AT_PLATFORM 15 a_ptr /* String identifying platform. */ AT_HWCAP 16 a_val /* Machine-dependent hints about processor capabilities. */ -AT_CLKTCK 17 /* Frequency of times(), always 100 */ +AT_CLKTCK 17 /* Frequency of times( ), always 100 */ AT_DCACHEBSIZE 19 a_val /* Data cache block size */ AT_ICACHEBSIZE 20 a_val /* Instruction cache block size */ AT_UCACHEBSIZE 21 a_val /* Unified cache block size */ @@ -591,72 +589,72 @@ AT_SYSINFO_EHDR 33 /* In many architectures, the kernel VDSO header that is used by the dynamic linker to resolve function symbols with the VDSO. */ - AT_NULL - The auxiliary vector has no fixed length; instead an entry of this - type denotes the end of the vector. The corresponding value of a_un is - undefined. - AT_PHDR - Under some conditions, the system creates the memory image of the - application program before passing control to an interpreter program. - When this happens, the a_ptr member of the AT_PHDR entry tells the - interpreter where to find the program header table in the memory image. - If the AT_PHDR entry is present, entries of types AT_PHENT, AT_PHNUM, and - AT_ENTRY must also be present. See the Program Header section in Chapter - 5 of the - System V ABI for more information about the program - header table. - AT_PHENT - The a_val member of this entry holds the size, in bytes, of one - entry in the program header table to which the AT_PHDR entry - points. 
- AT_PHNUM - The a_val member of this entry holds the number of entries in the - program header table to which the AT_PHDR entry points. - AT_PAGESZ - If present, this entry's a_val member gives the system page size in - bytes. The same information is also available through the sysconf system - call. - AT_BASE - The a_ptr member of this entry holds the base address at which the - interpreter program was loaded into memory. See the Program Header - section in Chapter 5 of the - System V ABI for more information about the base - address. - AT_FLAGS - If present, the a_val member of this entry holds 1-bit flags. Bits - with undefined semantics are set to zero. Other auxiliary vector types - are reserved. No flags are currently defined for AT_FLAGS on the 64-bit - OpenPOWER ABI Architecture. - AT_ENTRY - The a_ptr member of this entry holds the entry point of the - application program to which the interpreter program should transfer - control. - AT_DCACHEBSIZE - The a_val member of this entry gives the data cache block size for - processors on the system on which this program is running. If the - processors have unified caches, AT_DCACHEBSIZE is the same as - AT_UCACHEBSIZE. - AT_ICACHEBSIZE - The a_val member of this entry gives the instruction cache block - size for processors on the system on which this program is running. If - the processors have unified caches, AT_ICACHEBSIZE is the same as - AT_UCACHEBSIZE. - AT_UCACHEBSIZE - The a_val member of this entry is zero if the processors on the - system on which this program is running do not have a unified instruction - and data cache. Otherwise, it gives the cache block size. - AT_PLATFORM - The a_ptr member is the address of the platform name string. For - virtualized systems, this may be different (that is, an older platform) - than the physical machine running this environment. - AT_BASE_PLATFORM - The a_ptr member is the address of the platform name string for the - physical machine. 
For virtualized systems, this will be the platform name - of the real hardware. - AT_HWCAP - The a_val member of this entry is a bit map of hardware - capabilities. Some bit mask values include: - PPC_FEATURE_32 0x80000000 /* Always set for powerpc64 */ + AT_NULL + The auxiliary vector has no fixed length; instead an entry of this + type denotes the end of the vector. The corresponding value of a_un is + undefined. + AT_PHDR + Under some conditions, the system creates the memory image of the + application program before passing control to an interpreter program. + When this happens, the a_ptr member of the AT_PHDR entry tells the + interpreter where to find the program header table in the memory image. + If the AT_PHDR entry is present, entries of types AT_PHENT, AT_PHNUM, and + AT_ENTRY must also be present. See the Program Header section in Chapter + 5 of the + System V ABI for more information about the program + header table. + AT_PHENT + The a_val member of this entry holds the size, in bytes, of one + entry in the program header table to which the AT_PHDR entry + points. + AT_PHNUM + The a_val member of this entry holds the number of entries in the + program header table to which the AT_PHDR entry points. + AT_PAGESZ + If present, this entry's a_val member gives the system page size in + bytes. The same information is also available through the sysconf system + call. + AT_BASE + The a_ptr member of this entry holds the base address at which the + interpreter program was loaded into memory. See the Program Header + section in Chapter 5 of the + System V ABI for more information about the base + address. + AT_FLAGS + If present, the a_val member of this entry holds 1-bit flags. Bits + with undefined semantics are set to zero. Other auxiliary vector types + are reserved. No flags are currently defined for AT_FLAGS on the 64-bit + OpenPOWER ABI Architecture. 
+ AT_ENTRY + The a_ptr member of this entry holds the entry point of the + application program to which the interpreter program should transfer + control. + AT_DCACHEBSIZE + The a_val member of this entry gives the data cache block size for + processors on the system on which this program is running. If the + processors have unified caches, AT_DCACHEBSIZE is the same as + AT_UCACHEBSIZE. + AT_ICACHEBSIZE + The a_val member of this entry gives the instruction cache block + size for processors on the system on which this program is running. If + the processors have unified caches, AT_ICACHEBSIZE is the same as + AT_UCACHEBSIZE. + AT_UCACHEBSIZE + The a_val member of this entry is zero if the processors on the + system on which this program is running do not have a unified instruction + and data cache. Otherwise, it gives the cache block size. + AT_PLATFORM + The a_ptr member is the address of the platform name string. For + virtualized systems, this may be different (that is, an older platform) + than the physical machine running this environment. + AT_BASE_PLATFORM + The a_ptr member is the address of the platform name string for the + physical machine. For virtualized systems, this will be the platform name + of the real hardware. + AT_HWCAP + The a_val member of this entry is a bit map of hardware + capabilities. Some bit mask values include: + PPC_FEATURE_32 0x80000000 /* Always set for powerpc64 */ PPC_FEATURE_64 0x40000000 /* Always set for powerpc64 */ PPC_FEATURE_HAS_ALTIVEC 0x10000000 PPC_FEATURE_HAS_FPU 0x08000000 @@ -690,15 +688,17 @@ PPC_FEATURE2_HAS_ISEL 0x08000000 /* Integer Select */ PPC_FEATURE2_HAS_TAR 0x04000000 /* Target Address Register */ PPC_FEATURE2_HAS_VCRYPTO 0x02000000 /* The processor implements the Vector.AES category */ - When a process starts to execute, its stack holds the arguments, - environment, and auxiliary vector received from the exec call. 
The system - makes no guarantees about the relative arrangement of argument strings, - environment strings, and the auxiliary information, which appear in no - defined or predictable order. Further, the system may allocate memory - after the null auxiliary vector entry and before the beginning of the - information block. + When a process starts to execute, its stack holds the arguments, + environment, and auxiliary vector received from the exec call. The system + makes no guarantees about the relative arrangement of argument strings, + environment strings, and the auxiliary information, which appear in no + defined or predictable order. Further, the system may allocate memory + after the null auxiliary vector entry and before the beginning of the + information block. +
+
Dynamic Linking
@@ -709,8 +709,7 @@ PPC_FEATURE2_HAS_VCRYPTO 0x02000000 /* The processor implements the
Dynamic Section - - The dynamic + The dynamic section provides information used by the dynamic linker to manage dynamically loaded shared objects, including relocation, initialization, and termination when loaded or unloaded, resolving dependencies on other @@ -877,189 +876,190 @@ PPC_FEATURE2_HAS_VCRYPTO 0x02000000 /* The processor implements the stored in the file image. The individual PLT entries are populated by the dynamic linker using one of the following binding methods. Execution can then be redirected to a dependent shared object or executable. -
-
- Lazy Binding - The lazy binding method is the default. It delays the resolution of - a PLT entry to an absolute address until the function call is made the - first time. The benefit of this method is that the application does not - pay the resolution cost until the first time it needs to call the - function, if at all. - To implement lazy binding, the dynamic loader points each PLT entry - to a lazy resolution stub at load time. After the function call is made - the first time, this lazy resolution stub gets control, resolves the - symbol, and updates the PLT entry to hold the final value to be used for - future calls. -
-
- Immediate Binding - The immediate binding method resolves the absolute addresses of all - PLT entries in the executable and dependent shared objects at load time, - before passing execution control to the application. The environment - variable LD_BIND_NOW may be set to a nonnull value to signal the dynamic - linker that immediate binding is requested at load time, before control - is given to the application. - For some performance-sensitive situations, it may be better to pay - the resolution cost to populate the PLT entries up front rather than - during execution. -
-
- Procedure Linkage Table - For every call site that needs to use the PLT, the link editor - constructs a call stub in the .text section and resolves the call site to - use that call stub. The call stub transfers control to the address - indicated in the PLT entry. These call stubs need not be adjacent to one - another or unique. They can be scattered throughout the text segment so - that they can be reached with a branch and link instruction. - Depending on relocation information at the call site, the stub - provides one of the following properties: - - - The caller has set up r2 to hold the TOC pointer and expects - the PLT call stub to save that value to the TOC save stack slot. This - is the default. - - - The caller has set up r2 to hold the TOC pointer and has - already saved that value to the TOC save stack slot itself. This is - indicated by the presence of a R_PPC64_TOCSAVE relocation on the nop - following the call. - - - tocsaveloc: - nop - ... -bl target - .reloc ., R_PPC64_TOCSAVE, tocsaveloc - nop - - - 3. The caller has not set up r2 to hold the TOC pointer. This - is indicated by use of a R_PPC64_REL24_NOTOC relocation (instead of - R_PPC64_REL24) on the call instruction. - - - In any scenario, the PLT call stub must transfer control to the - function whose address is provided in the associated PLT entry. This - address is treated as a global entry point for ABI purposes. This means - that the PLT call stub loads the address into r12 before transferring - control. - Although the details of the call stub implementation are left to - the link editor, some examples are provided. In those examples, func@plt - is used to denote the address of the PLT entry for func; func@plt@toc - denotes the offset of that address relative to the TOC pointer; and the - @ha and @l variants denote the high-adjusted and low parts of these - values as usual. 
Because the link editor synthesizes the PLT call stubs - directly, it can determine all these values as immediate constants. The - assembler is not required to support those notations. - A possible implementation for case 1 looks as follows (if - func@plt@toc is less than 32 KB, the call stub may be simplified to omit - the addis): - std r2,24(r1) -addis r12,r2,func@plt@toc@ha -ld r12,func@plt@toc@l(r12) -mtctr r12 -bctr - For case 2, the same implementation as for case 1 may be used, - except that the first instruction “std r2,24(r1)” is omitted: - addis r12,r2,func@plt@toc@ha -ld r12,func@plt@toc@l(r12) -mtctr r12 -bctr - - A possible implementation for case 3 looks as - follows: - mflr r0 - bcl 20,31,1f -1: mflr r2 - mtlr r0 - addis r2,r2,(.TOC.-1b)@ha - addi r2,r2,(.TOC.-1b)@l - addis r12,r2,func@plt@toc@ha - ld r12,func@plt@toc@l(r12) - mtctr r12 - bctr - When generating non-PIC code for the small or medium code model, a - simpler variant may alternatively be used for cases 2 or 3: - lis r12,func@plt@ha -ld r12,func@plt@l(r12) -mtctr r12 -bctr - To support lazy binding, the link editor also provides a set of - symbol resolver stubs, one for each PLT entry. Each resolver stub - consists of a single instruction, which is usually a branch to a common - resolver entry point or a nop. The resolver stubs are placed in the - .glink section, which is merged into the .text section of the final - executable or dynamic object. The address of the resolver stubs is - communicated to the dynamic loader through the DT_PPC64_GLINK dynamic - section entry. The address of the symbol resolver stub associated with - PLT entry N is determined by adding 4xN + 32 to the d_ptr field of the - DT_PPC64_GLINK entry. When using lazy binding, the dynamic linker - initializes each PLT entry at load time to that address. - The resolver stubs provided by the link editor must call into the - main resolver routine provided by the dynamic linker. 
This resolver - routine must be called with r0 set to the index of the PLT entry to be - resolved, r11 set to the identifier of the current dynamic object, and - r12 set to the resolver entry point address (as usual when calling a - global entry point). The resolver entry point address and the dynamic - object identifier are installed at load time by the dynamic linker into - the two doublewords immediately preceding the array of PLT entries, - allowing the resolver stubs to retrieve these values from there. These - two doublewords are considered part of the .plt section; the DT_PLTGOT - dynamic section entry points to the first of those words. - Beyond the above requirements, the implementation of the .glink - resolver stubs is up to the link editor. The following shows an example - implementation: - # ABI note: At entry to the resolver stub: - # - r12 holds the address of the res_N stub for the target routine - # - all argument registers hold arguments for the target routine -PLTresolve: - # Determine addressability. This sequence works for both PIC - # and non-PIC code and does not rely on presence of the TOC pointer. 
- mflr r0 - bcl 20,31,1f -1: mflr r11 - mtlr r0 - # Compute .plt section index from entry point address in r12 - # .plt section index is placed into r0 as argument to the resolver - sub r0,r12,r11 - subi r0,r0,res_0-1b - srdi r0,r0,2 - # Load address of the first byte of the PLT - ld r12,PLToffset-1b(r11) - add r11,r12,r11 - # Load resolver address and DSO identifier from the - # first two doublewords of the PLT - ld r12,0(r11) - ld r11,8(r11) - # Branch to resolver - mtctr r12 - bctr - # ABI note: At entry to the resolver: - # - r12 holds the resolver address - # - r11 holds the DSO identifier - # - r0 holds the PLT index of the target routine - # - all argument registers hold arguments for the target routine - - # Constant pool holding offset to the PLT - # Note that there is no actual symbol PLT; the link editor - # synthesizes this value when creating the .glink section -PLToffset: - .quad PLT-. - - # A table of branches, one for each PLT entry - # The idea is that the PLT call stub loads r12 with these - # addresses, so (r12 - res_0) gives the PLT index × 4. - -res_0: b PLTresolve -res_1: b PLTresolve - ... - After resolution, the value of a PLT entry in the PLT is the - address of the function’s global entry point, unless the resolver can - determine that a module-local call occurs with a shared TOC value wherein - the TOC is shared between the caller and the callee. - + +
+ Lazy Binding + The lazy binding method is the default. It delays the resolution of + a PLT entry to an absolute address until the function is called for the + first time. The benefit of this method is that the application does not + pay the resolution cost until the first time it needs to call the + function, if at all. + To implement lazy binding, the dynamic loader points each PLT entry + to a lazy resolution stub at load time. When the function is called for + the first time, this lazy resolution stub gets control, resolves the + symbol, and updates the PLT entry to hold the final value to be used for + future calls. +
+
+ Immediate Binding + The immediate binding method resolves the absolute addresses of all + PLT entries in the executable and dependent shared objects at load time, + before passing execution control to the application. The environment + variable LD_BIND_NOW may be set to a nonnull value to signal the dynamic + linker that immediate binding is requested at load time, before control + is given to the application. + For some performance-sensitive situations, it may be better to pay + the resolution cost to populate the PLT entries up front rather than + during execution. +
+
+ Procedure Linkage Table + For every call site that needs to use the PLT, the link editor + constructs a call stub in the .text section and resolves the call site to + use that call stub. The call stub transfers control to the address + indicated in the PLT entry. These call stubs need not be adjacent to one + another or unique. They can be scattered throughout the text segment so + that they can be reached with a branch and link instruction. + Depending on relocation information at the call site, the stub + provides one of the following properties: + + + The caller has set up r2 to hold the TOC pointer and expects + the PLT call stub to save that value to the TOC save stack slot. This + is the default. + + + The caller has set up r2 to hold the TOC pointer and has + already saved that value to the TOC save stack slot itself. This is + indicated by the presence of a R_PPC64_TOCSAVE relocation on the nop + following the call. + + + tocsaveloc: + nop + ... + bl target + .reloc ., R_PPC64_TOCSAVE, tocsaveloc + nop + + + 3. The caller has not set up r2 to hold the TOC pointer. This + is indicated by use of a R_PPC64_REL24_NOTOC relocation (instead of + R_PPC64_REL24) on the call instruction. + + + In any scenario, the PLT call stub must transfer control to the + function whose address is provided in the associated PLT entry. This + address is treated as a global entry point for ABI purposes. This means + that the PLT call stub loads the address into r12 before transferring + control. + Although the details of the call stub implementation are left to + the link editor, some examples are provided. In those examples, func@plt + is used to denote the address of the PLT entry for func; func@plt@toc + denotes the offset of that address relative to the TOC pointer; and the + @ha and @l variants denote the high-adjusted and low parts of these + values as usual. 
Because the link editor synthesizes the PLT call stubs + directly, it can determine all these values as immediate constants. The + assembler is not required to support those notations. + A possible implementation for case 1 looks as follows (if + func@plt@toc is less than 32 KB, the call stub may be simplified to omit + the addis): + std r2,24(r1) + addis r12,r2,func@plt@toc@ha + ld r12,func@plt@toc@l(r12) + mtctr r12 + bctr + For case 2, the same implementation as for case 1 may be used, + except that the first instruction “std r2,24(r1)” is omitted: + addis r12,r2,func@plt@toc@ha + ld r12,func@plt@toc@l(r12) + mtctr r12 + bctr + A possible implementation for case 3 looks as + follows: + mflr r0 + bcl 20,31,1f + 1: mflr r2 + mtlr r0 + addis r2,r2,(.TOC.-1b)@ha + addi r2,r2,(.TOC.-1b)@l + addis r12,r2,func@plt@toc@ha + ld r12,func@plt@toc@l(r12) + mtctr r12 + bctr + When generating non-PIC code for the small or medium code model, a + simpler variant may alternatively be used for cases 2 or 3: + lis r12,func@plt@ha + ld r12,func@plt@l(r12) + mtctr r12 + bctr + To support lazy binding, the link editor also provides a set of + symbol resolver stubs, one for each PLT entry. Each resolver stub + consists of a single instruction, which is usually a branch to a common + resolver entry point or a nop. The resolver stubs are placed in the + .glink section, which is merged into the .text section of the final + executable or dynamic object. The address of the resolver stubs is + communicated to the dynamic loader through the DT_PPC64_GLINK dynamic + section entry. The address of the symbol resolver stub associated with + PLT entry N is determined by adding 4xN + 32 to the d_ptr field of the + DT_PPC64_GLINK entry. When using lazy binding, the dynamic linker + initializes each PLT entry at load time to that address. + The resolver stubs provided by the link editor must call into the + main resolver routine provided by the dynamic linker. 
This resolver + routine must be called with r0 set to the index of the PLT entry to be + resolved, r11 set to the identifier of the current dynamic object, and + r12 set to the resolver entry point address (as usual when calling a + global entry point). The resolver entry point address and the dynamic + object identifier are installed at load time by the dynamic linker into + the two doublewords immediately preceding the array of PLT entries, + allowing the resolver stubs to retrieve these values from there. These + two doublewords are considered part of the .plt section; the DT_PLTGOT + dynamic section entry points to the first of those doublewords. + Beyond the above requirements, the implementation of the .glink + resolver stubs is up to the link editor. The following shows an example + implementation: + # ABI note: At entry to the resolver stub: + # - r12 holds the address of the res_N stub for the target routine + # - all argument registers hold arguments for the target routine + PLTresolve: + # Determine addressability. This sequence works for both PIC + # and non-PIC code and does not rely on presence of the TOC pointer. 
+ mflr r0 + bcl 20,31,1f + 1: mflr r11 + mtlr r0 + # Compute .plt section index from entry point address in r12 + # .plt section index is placed into r0 as argument to the resolver + sub r0,r12,r11 + subi r0,r0,res_0-1b + srdi r0,r0,2 + # Load address of the first byte of the PLT + ld r12,PLToffset-1b(r11) + add r11,r12,r11 + # Load resolver address and DSO identifier from the + # first two doublewords of the PLT + ld r12,0(r11) + ld r11,8(r11) + # Branch to resolver + mtctr r12 + bctr + # ABI note: At entry to the resolver: + # - r12 holds the resolver address + # - r11 holds the DSO identifier + # - r0 holds the PLT index of the target routine + # - all argument registers hold arguments for the target routine + + # Constant pool holding offset to the PLT + # Note that there is no actual symbol PLT; the link editor + # synthesizes this value when creating the .glink section + PLToffset: + .quad PLT-. + + # A table of branches, one for each PLT entry + # The idea is that the PLT call stub loads r12 with these + # addresses, so (r12 - res_0) gives the PLT index × 4. + + res_0: b PLTresolve + res_1: b PLTresolve + ... + After resolution, the value of a PLT entry in the PLT is the + address of the function’s global entry point, unless the resolver can + determine that a module-local call occurs with a shared TOC value wherein + the TOC is shared between the caller and the callee. + +
+ diff --git a/specification/ch_5.xml b/specification/ch_5.xml index 4de0834..637191d 100644 --- a/specification/ch_5.xml +++ b/specification/ch_5.xml @@ -17,7 +17,7 @@ xml:id="dbdoclet.50655243_pgfId-1099317">
Malloc Routine Return Pointer Alignment - The malloc() routine must always return a pointer with the + The malloc( ) routine must always return a pointer with the alignment of the largest alignment needed for loads and stores of the built-in data types. This is currently 16 bytes.
@@ -195,8 +195,7 @@ xml:id="dbdoclet.50655243_pgfId-1099317"> __PPC64__ __powerpc64__ - __64BIT__ - + __64BIT__ Phased in. diff --git a/specification/ch_6.xml b/specification/ch_6.xml index f3bdabc..b84bf6a 100644 --- a/specification/ch_6.xml +++ b/specification/ch_6.xml @@ -60,7 +60,7 @@ xml:id="dbdoclet.50655244_pgfId-1095944"> alignment.
The preferred way to access vectors at an application-defined address is by using vector pointers and the C/C++ dereference operator *. Similar - to other C /C++ data types, the array reference operator [] may be used to + to other C /C++ data types, the array reference operator [ ] may be used to access vector objects with a vector pointer with the usual definition to access the n-th vector element from a vector pointer. The use of vector built-in functions such as vec_xl and vec_xst is discouraged except for @@ -136,7 +136,7 @@ register vector double vd = vec_splats(*double_ptr); layout and vector element ordering in big-endian environments shall be big endian, and the default vector layout and vector element ordering in little-endian environments shall be little endian. - This element numbering shall also be used by the [] accessor method + This element numbering shall also be used by the [ ] accessor method to vector elements provided as an extension of the C/C++ languages by some compilers, as well as for other language extensions or library constructs that directly or indirectly refer to elements by their element @@ -203,9 +203,9 @@ register vector double vd = vec_splats(*double_ptr); Endian-Sensitive Operations - - - + + + @@ -274,8 +274,7 @@ register vector double vd = vec_splats(*double_ptr); - vec_extract_fp32_ - from_shorth + vec_extract_fp32_from_shorth @@ -286,8 +285,7 @@ register vector double vd = vec_splats(*double_ptr); - vec_extract_fp32_ - from_shortl + vec_extract_fp32_from_shortl @@ -310,8 +308,7 @@ register vector double vd = vec_splats(*double_ptr); - vec_first_match - _index + vec_first_match_index @@ -322,8 +319,7 @@ register vector double vd = vec_splats(*double_ptr); - vec_first_match - _index_or_eos + vec_first_match_index_or_eos @@ -364,8 +360,7 @@ register vector double vd = vec_splats(*double_ptr); vmrgew - Swap inputs and use vmrgow for LE. Phased in. - + Swap inputs and use vmrgow for LE. Phased in. 
This optional function is being phased in, and it may not be available on all implementations. @@ -401,8 +396,7 @@ register vector double vd = vec_splats(*double_ptr); vmrgow - Swap inputs and use vmrgew for LE. Phased in. - + Swap inputs and use vmrgew for LE. Phased in. @@ -754,8 +748,7 @@ register vector double vd = vec_splats(*double_ptr); - vec_xlw4 - + vec_xlw4 Deprecated. The use of vector data type assignment and overloaded vec_xl and vec_xst vector built-in functions are preferred forms for assigning @@ -774,8 +767,7 @@ register vector double vd = vec_splats(*double_ptr); - vec_xld2 - + vec_xld2 @@ -798,8 +790,7 @@ register vector double vd = vec_splats(*double_ptr); - vec_xstw4 - + vec_xstw4 @@ -811,8 +802,7 @@ register vector double vd = vec_splats(*double_ptr); - vec_xstd2 - + vec_xstd2 @@ -1173,8 +1163,7 @@ register vector double vd = vec_splats(*double_ptr); - vec_xlw4 - + vec_xlw4 Deprecated. The use of vector data type assignment and overloaded vec_xl and vec_xst vector built-in functions are preferred forms for assigning @@ -1193,8 +1182,7 @@ register vector double vd = vec_splats(*double_ptr); - vec_xld2 - + vec_xld2 @@ -1219,8 +1207,7 @@ register vector double vd = vec_splats(*double_ptr); - vec_xstw4 - + vec_xstw4 @@ -1232,8 +1219,7 @@ register vector double vd = vec_splats(*double_ptr); - vec_xstd2 - + vec_xstd2 @@ -1295,6 +1281,53 @@ register vector double vd = vec_splats(*double_ptr); + + + VEC_CONCAT (ARG1, ARG2)(Fortran) + + + + Purpose: + Concatenates two elements to form a vector. + Result value: + The resulting vector consists of the two scalar elements, + ARG1 and ARG2, assigned to elements 0 and 1 (using the + environment’s native endian numbering), respectively. + + + Note: This function corresponds to the C/C++ vector + constructor (vector type){a,b}. It is provided only for + languages without vector constructors. 
+ + + + + + + + + + vector signed long long vec_concat (signed long long, + signed long long); + + + + + + + + vector unsigned long long vec_concat (unsigned long long, + unsigned long long); + + + + + + + + vector double vec_concat (double, double); + + VEC_CONVERT(V, MOLD)