Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save xiangzhai/e6fc42f6b9fc959de6baa5d793ea9263 to your computer and use it in GitHub Desktop.

Select an option

Save xiangzhai/e6fc42f6b9fc959de6baa5d793ea9263 to your computer and use it in GitHub Desktop.
Backport: reorganize the Loongson -march options and extension instruction sets to gcc-8.1.0
Author: Leslie Zhai <[email protected]>
Date: Tue Sep 25 14:08:58 2018 +0800
Backport: reorganize the Loongson -march options and extension instruction sets to gcc-8.1.0
diff --git a/gcc/config.gcc b/gcc/config.gcc
index a5defb0..067fdad 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -439,7 +439,7 @@ microblaze*-*-*)
;;
mips*-*-*)
cpu_type=mips
- extra_headers="loongson.h msa.h"
+ extra_headers="loongson.h loongson-mmiintrin.h msa.h"
extra_objs="frame-header-opt.o"
extra_options="${extra_options} g.opt fused-madd.opt mips/mips-tables.opt"
;;
diff --git a/gcc/config/mips/gs264e.md b/gcc/config/mips/gs264e.md
new file mode 100644
index 0000000..9b30bb5
--- /dev/null
+++ b/gcc/config/mips/gs264e.md
@@ -0,0 +1,133 @@
+;; Pipeline model for Loongson gs264e cores.
+
+;; Copyright (C) 2011-2018 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; Uncomment the following line to output automata for debugging.
+;; (automata_option "v")
+
+;; Automaton for integer instructions.
+(define_automaton "gs264e_a_alu")
+
+;; Automaton for floating-point instructions.
+(define_automaton "gs264e_a_falu")
+
+;; Automaton for memory operations.
+(define_automaton "gs264e_a_mem")
+
+;; Describe the resources.
+
+(define_cpu_unit "gs264e_alu1" "gs264e_a_alu")
+(define_cpu_unit "gs264e_mem1" "gs264e_a_mem")
+(define_cpu_unit "gs264e_falu1" "gs264e_a_falu")
+
+;; Describe instruction reservations.
+
+(define_insn_reservation "gs264e_arith" 1
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "arith,clz,const,logical,
+ move,nop,shift,signext,slt"))
+ "gs264e_alu1")
+
+(define_insn_reservation "gs264e_branch" 1
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "branch,jump,call,condmove,trap"))
+ "gs264e_alu1")
+
+(define_insn_reservation "gs264e_mfhilo" 1
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "mfhi,mflo,mthi,mtlo"))
+ "gs264e_alu1")
+
+;; Operation imul3nc is fully pipelined.
+(define_insn_reservation "gs264e_imul3nc" 7
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "imul3nc"))
+ "gs264e_alu1")
+
+(define_insn_reservation "gs264e_imul" 7
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "imul,imadd"))
+ "gs264e_alu1")
+
+(define_insn_reservation "gs264e_idiv_si" 12
+ (and (eq_attr "cpu" "gs264e")
+ (and (eq_attr "type" "idiv")
+ (eq_attr "mode" "SI")))
+ "gs264e_alu1")
+
+(define_insn_reservation "gs264e_idiv_di" 25
+ (and (eq_attr "cpu" "gs264e")
+ (and (eq_attr "type" "idiv")
+ (eq_attr "mode" "DI")))
+ "gs264e_alu1")
+
+(define_insn_reservation "gs264e_load" 4
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "load"))
+ "gs264e_mem1")
+
+(define_insn_reservation "gs264e_fpload" 4
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "load,mfc,mtc"))
+ "gs264e_mem1")
+
+(define_insn_reservation "gs264e_prefetch" 0
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "prefetch,prefetchx"))
+ "gs264e_mem1")
+
+(define_insn_reservation "gs264e_store" 0
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "store,fpstore,fpidxstore"))
+ "gs264e_mem1")
+
+(define_insn_reservation "gs264e_fadd" 4
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "fadd,fmul,fmadd"))
+ "gs264e_falu1")
+
+(define_insn_reservation "gs264e_fcmp" 2
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "fabs,fcmp,fmove,fneg"))
+ "gs264e_falu1")
+
+(define_insn_reservation "gs264e_fcvt" 4
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "fcvt"))
+ "gs264e_falu1")
+
+(define_insn_reservation "gs264e_fdiv_sf" 12
+ (and (eq_attr "cpu" "gs264e")
+ (and (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt")
+ (eq_attr "mode" "SF")))
+ "gs264e_falu1")
+
+(define_insn_reservation "gs264e_fdiv_df" 19
+ (and (eq_attr "cpu" "gs264e")
+ (and (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt")
+ (eq_attr "mode" "DF")))
+ "gs264e_falu1")
+
+;; Force single-dispatch for unknown or multi.
+(define_insn_reservation "gs264e_unknown" 1
+ (and (eq_attr "cpu" "gs264e")
+ (eq_attr "type" "unknown,multi,atomic,syncloop"))
+ "gs264e_alu1 + gs264e_falu1 + gs264e_mem1")
+
+;; End of DFA-based pipeline description for gs264e
diff --git a/gcc/config/mips/gs464.md b/gcc/config/mips/gs464.md
new file mode 100644
index 0000000..82efb66
--- /dev/null
+++ b/gcc/config/mips/gs464.md
@@ -0,0 +1,137 @@
+;; Pipeline model for Loongson gs464 cores.
+
+;; Copyright (C) 2011-2018 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; Uncomment the following line to output automata for debugging.
+;; (automata_option "v")
+
+;; Automaton for integer instructions.
+(define_automaton "gs464_a_alu")
+
+;; Automaton for floating-point instructions.
+(define_automaton "gs464_a_falu")
+
+;; Automaton for memory operations.
+(define_automaton "gs464_a_mem")
+
+;; Describe the resources.
+
+(define_cpu_unit "gs464_alu1" "gs464_a_alu")
+(define_cpu_unit "gs464_alu2" "gs464_a_alu")
+(define_cpu_unit "gs464_mem" "gs464_a_mem")
+(define_cpu_unit "gs464_falu1" "gs464_a_falu")
+(define_cpu_unit "gs464_falu2" "gs464_a_falu")
+
+;; Describe instruction reservations.
+
+(define_insn_reservation "gs464_arith" 1
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "arith,clz,const,logical,
+ move,nop,shift,signext,slt"))
+ "gs464_alu1 | gs464_alu2")
+
+(define_insn_reservation "gs464_branch" 1
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "branch,jump,call,condmove,trap"))
+ "gs464_alu1")
+
+(define_insn_reservation "gs464_mfhilo" 1
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "mfhi,mflo,mthi,mtlo"))
+ "gs464_alu2")
+
+;; Operation imul3nc is fully pipelined.
+(define_insn_reservation "gs464_imul3nc" 5
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "imul3nc"))
+ "gs464_alu2")
+
+(define_insn_reservation "gs464_imul" 7
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "imul,imadd"))
+ "gs464_alu2 * 7")
+
+(define_insn_reservation "gs464_idiv_si" 12
+ (and (eq_attr "cpu" "gs464")
+ (and (eq_attr "type" "idiv")
+ (eq_attr "mode" "SI")))
+ "gs464_alu2 * 12")
+
+(define_insn_reservation "gs464_idiv_di" 25
+ (and (eq_attr "cpu" "gs464")
+ (and (eq_attr "type" "idiv")
+ (eq_attr "mode" "DI")))
+ "gs464_alu2 * 25")
+
+(define_insn_reservation "gs464_load" 3
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "load"))
+ "gs464_mem")
+
+(define_insn_reservation "gs464_fpload" 4
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "load,mfc,mtc"))
+ "gs464_mem")
+
+(define_insn_reservation "gs464_prefetch" 0
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "prefetch,prefetchx"))
+ "gs464_mem")
+
+(define_insn_reservation "gs464_store" 0
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "store,fpstore,fpidxstore"))
+ "gs464_mem")
+
+;; All the fp operations can be executed in FALU1. Only fp add,
+;; sub, mul, madd can be executed in FALU2. Try FALU2 firstly.
+(define_insn_reservation "gs464_fadd" 6
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "fadd,fmul,fmadd"))
+ "gs464_falu2 | gs464_falu1")
+
+(define_insn_reservation "gs464_fcmp" 2
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "fabs,fcmp,fmove,fneg"))
+ "gs464_falu1")
+
+(define_insn_reservation "gs464_fcvt" 4
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "fcvt"))
+ "gs464_falu1")
+
+(define_insn_reservation "gs464_fdiv_sf" 12
+ (and (eq_attr "cpu" "gs464")
+ (and (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt")
+ (eq_attr "mode" "SF")))
+ "gs464_falu1 * 12")
+
+(define_insn_reservation "gs464_fdiv_df" 19
+ (and (eq_attr "cpu" "gs464")
+ (and (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt")
+ (eq_attr "mode" "DF")))
+ "gs464_falu1 * 19")
+
+;; Force single-dispatch for unknown or multi.
+(define_insn_reservation "gs464_unknown" 1
+ (and (eq_attr "cpu" "gs464")
+ (eq_attr "type" "unknown,multi,atomic,syncloop"))
+ "gs464_alu1 + gs464_alu2 + gs464_falu1 + gs464_falu2 + gs464_mem")
+
+;; End of DFA-based pipeline description for gs464
diff --git a/gcc/config/mips/gs464e.md b/gcc/config/mips/gs464e.md
new file mode 100644
index 0000000..e2ef37d
--- /dev/null
+++ b/gcc/config/mips/gs464e.md
@@ -0,0 +1,137 @@
+;; Pipeline model for Loongson gs464e cores.
+
+;; Copyright (C) 2011-2018 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; Uncomment the following line to output automata for debugging.
+;; (automata_option "v")
+
+;; Automaton for integer instructions.
+(define_automaton "gs464e_a_alu")
+
+;; Automaton for floating-point instructions.
+(define_automaton "gs464e_a_falu")
+
+;; Automaton for memory operations.
+(define_automaton "gs464e_a_mem")
+
+;; Describe the resources.
+
+(define_cpu_unit "gs464e_alu1" "gs464e_a_alu")
+(define_cpu_unit "gs464e_alu2" "gs464e_a_alu")
+(define_cpu_unit "gs464e_mem1" "gs464e_a_mem")
+(define_cpu_unit "gs464e_mem2" "gs464e_a_mem")
+(define_cpu_unit "gs464e_falu1" "gs464e_a_falu")
+(define_cpu_unit "gs464e_falu2" "gs464e_a_falu")
+
+;; Describe instruction reservations.
+
+(define_insn_reservation "gs464e_arith" 1
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "arith,clz,const,logical,
+ move,nop,shift,signext,slt"))
+ "gs464e_alu1 | gs464e_alu2")
+
+(define_insn_reservation "gs464e_branch" 1
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "branch,jump,call,condmove,trap"))
+ "gs464e_alu1 | gs464e_alu2")
+
+(define_insn_reservation "gs464e_mfhilo" 1
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "mfhi,mflo,mthi,mtlo"))
+ "gs464e_alu1 | gs464e_alu2")
+
+;; Operation imul3nc is fully pipelined.
+(define_insn_reservation "gs464e_imul3nc" 5
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "imul3nc"))
+ "gs464e_alu1 | gs464e_alu2")
+
+(define_insn_reservation "gs464e_imul" 7
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "imul,imadd"))
+ "gs464e_alu1 | gs464e_alu2")
+
+(define_insn_reservation "gs464e_idiv_si" 12
+ (and (eq_attr "cpu" "gs464e")
+ (and (eq_attr "type" "idiv")
+ (eq_attr "mode" "SI")))
+ "gs464e_alu1 | gs464e_alu2")
+
+(define_insn_reservation "gs464e_idiv_di" 25
+ (and (eq_attr "cpu" "gs464e")
+ (and (eq_attr "type" "idiv")
+ (eq_attr "mode" "DI")))
+ "gs464e_alu1 | gs464e_alu2")
+
+(define_insn_reservation "gs464e_load" 4
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "load"))
+ "gs464e_mem1 | gs464e_mem2")
+
+(define_insn_reservation "gs464e_fpload" 5
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "load,mfc,mtc"))
+ "gs464e_mem1 | gs464e_mem2")
+
+(define_insn_reservation "gs464e_prefetch" 0
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "prefetch,prefetchx"))
+ "gs464e_mem1 | gs464e_mem2")
+
+(define_insn_reservation "gs464e_store" 0
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "store,fpstore,fpidxstore"))
+ "gs464e_mem1 | gs464e_mem2")
+
+(define_insn_reservation "gs464e_fadd" 4
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "fadd,fmul,fmadd"))
+ "gs464e_falu1 | gs464e_falu2")
+
+(define_insn_reservation "gs464e_fcmp" 2
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "fabs,fcmp,fmove,fneg"))
+ "gs464e_falu1 | gs464e_falu2")
+
+(define_insn_reservation "gs464e_fcvt" 4
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "fcvt"))
+ "gs464e_falu1 | gs464e_falu2")
+
+(define_insn_reservation "gs464e_fdiv_sf" 12
+ (and (eq_attr "cpu" "gs464e")
+ (and (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt")
+ (eq_attr "mode" "SF")))
+ "gs464e_falu1 | gs464e_falu2")
+
+(define_insn_reservation "gs464e_fdiv_df" 19
+ (and (eq_attr "cpu" "gs464e")
+ (and (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt")
+ (eq_attr "mode" "DF")))
+ "gs464e_falu1 | gs464e_falu2")
+
+;; Force single-dispatch for unknown or multi.
+(define_insn_reservation "gs464e_unknown" 1
+ (and (eq_attr "cpu" "gs464e")
+ (eq_attr "type" "unknown,multi,atomic,syncloop"))
+ "gs464e_alu1 + gs464e_alu2 + gs464e_falu1
+ + gs464e_falu2 + gs464e_mem1 + gs464e_mem2")
+
+;; End of DFA-based pipeline description for gs464e
diff --git a/gcc/config/mips/loongson-mmi.md b/gcc/config/mips/loongson-mmi.md
new file mode 100644
index 0000000..ad23f67
--- /dev/null
+++ b/gcc/config/mips/loongson-mmi.md
@@ -0,0 +1,903 @@
+;; Machine description for Loongson MultiMedia extensions Instructions (MMI).
+;; Copyright (C) 2008-2018 Free Software Foundation, Inc.
+;; Contributed by CodeSourcery.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_c_enum "unspec" [
+ UNSPEC_LOONGSON_PAVG
+ UNSPEC_LOONGSON_PCMPEQ
+ UNSPEC_LOONGSON_PCMPGT
+ UNSPEC_LOONGSON_PEXTR
+ UNSPEC_LOONGSON_PINSRH
+ UNSPEC_LOONGSON_VINIT
+ UNSPEC_LOONGSON_PMADD
+ UNSPEC_LOONGSON_PMOVMSK
+ UNSPEC_LOONGSON_PMULHU
+ UNSPEC_LOONGSON_PMULH
+ UNSPEC_LOONGSON_PMULU
+ UNSPEC_LOONGSON_PASUBUB
+ UNSPEC_LOONGSON_BIADD
+ UNSPEC_LOONGSON_PSADBH
+ UNSPEC_LOONGSON_PSHUFH
+ UNSPEC_LOONGSON_PUNPCKH
+ UNSPEC_LOONGSON_PUNPCKL
+ UNSPEC_LOONGSON_PADDD
+ UNSPEC_LOONGSON_PSUBD
+ UNSPEC_LOONGSON_DSLL
+ UNSPEC_LOONGSON_DSRL
+])
+
+;; Mode iterators and attributes.
+
+;; 64-bit vectors of bytes.
+(define_mode_iterator VB [V8QI])
+
+;; 64-bit vectors of halfwords.
+(define_mode_iterator VH [V4HI])
+
+;; 64-bit vectors of words.
+(define_mode_iterator VW [V2SI])
+
+;; 64-bit vectors of halfwords and bytes.
+(define_mode_iterator VHB [V4HI V8QI])
+
+;; 64-bit vectors of words and halfwords.
+(define_mode_iterator VWH [V2SI V4HI])
+
+;; 64-bit vectors of words and bytes
+(define_mode_iterator VWB [V2SI V8QI])
+
+;; 64-bit vectors of words, halfwords and bytes.
+(define_mode_iterator VWHB [V2SI V4HI V8QI])
+
+;; 64-bit vectors of words, halfwords and bytes; and DImode.
+(define_mode_iterator VWHBDI [V2SI V4HI V8QI DI])
+
+;; The Loongson instruction suffixes corresponding to the modes in the
+;; VWHBDI iterator.
+(define_mode_attr V_suffix [(V2SI "w") (V4HI "h") (V8QI "b") (DI "d")])
+
+;; Given a vector type T, the mode of a vector half the size of T
+;; and with the same number of elements.
+(define_mode_attr V_squash [(V2SI "V2HI") (V4HI "V4QI")])
+
+;; Given a vector type T, the mode of a vector the same size as T
+;; but with half as many elements.
+(define_mode_attr V_stretch_half [(V2SI "DI") (V4HI "V2SI") (V8QI "V4HI")])
+
+;; The Loongson instruction suffixes corresponding to the transformation
+;; expressed by V_stretch_half.
+(define_mode_attr V_stretch_half_suffix [(V2SI "wd") (V4HI "hw") (V8QI "bh")])
+
+;; Given a vector type T, the mode of a vector the same size as T
+;; but with twice as many elements.
+(define_mode_attr V_squash_double [(V2SI "V4HI") (V4HI "V8QI")])
+
+;; Given a vector type T, the inner mode.
+(define_mode_attr V_inner [(V8QI "QI") (V4HI "HI") (V2SI "SI")])
+
+;; The Loongson instruction suffixes corresponding to the conversions
+;; specified by V_squash_double.
+(define_mode_attr V_squash_double_suffix [(V2SI "wh") (V4HI "hb")])
+
+;; Move patterns.
+
+;; Expander to legitimize moves involving values of vector modes.
+(define_expand "mov<mode>"
+ [(set (match_operand:VWHB 0)
+ (match_operand:VWHB 1))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ if (mips_legitimize_move (<MODE>mode, operands[0], operands[1]))
+ DONE;
+})
+
+;; Handle legitimized moves between values of vector modes.
+(define_insn "mov<mode>_internal"
+ [(set (match_operand:VWHB 0 "nonimmediate_operand" "=m,f,d,f, d, m, d")
+ (match_operand:VWHB 1 "move_operand" "f,m,f,dYG,dYG,dYG,m"))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ { return mips_output_move (operands[0], operands[1]); }
+ [(set_attr "move_type" "fpstore,fpload,mfc,mtc,move,store,load")
+ (set_attr "mode" "DI")])
+
+;; Initialization of a vector.
+
+(define_expand "vec_init<mode><unitmode>"
+ [(set (match_operand:VWHB 0 "register_operand")
+ (match_operand 1 ""))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ mips_expand_vector_init (operands[0], operands[1]);
+ DONE;
+})
+
+;; Helper for vec_init. Initialize element 0 of the output from the input.
+;; All other elements are undefined.
+(define_insn "loongson_vec_init1_<mode>"
+ [(set (match_operand:VHB 0 "register_operand" "=f")
+ (unspec:VHB [(truncate:<V_inner>
+ (match_operand:DI 1 "reg_or_0_operand" "Jd"))]
+ UNSPEC_LOONGSON_VINIT))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "dmtc1\t%z1,%0"
+ [(set_attr "move_type" "mtc")
+ (set_attr "mode" "DI")])
+
+;; Helper for vec_initv2si.
+(define_insn "*vec_concatv2si"
+ [(set (match_operand:V2SI 0 "register_operand" "=f")
+ (vec_concat:V2SI
+ (match_operand:SI 1 "register_operand" "f")
+ (match_operand:SI 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpcklwd\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+;; Instruction patterns for SIMD instructions.
+
+;; Pack with signed saturation.
+(define_insn "vec_pack_ssat_<mode>"
+ [(set (match_operand:<V_squash_double> 0 "register_operand" "=f")
+ (vec_concat:<V_squash_double>
+ (ss_truncate:<V_squash>
+ (match_operand:VWH 1 "register_operand" "f"))
+ (ss_truncate:<V_squash>
+ (match_operand:VWH 2 "register_operand" "f"))))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "packss<V_squash_double_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fmul")])
+
+;; Pack with unsigned saturation.
+(define_insn "vec_pack_usat_<mode>"
+ [(set (match_operand:<V_squash_double> 0 "register_operand" "=f")
+ (vec_concat:<V_squash_double>
+ (us_truncate:<V_squash>
+ (match_operand:VH 1 "register_operand" "f"))
+ (us_truncate:<V_squash>
+ (match_operand:VH 2 "register_operand" "f"))))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "packus<V_squash_double_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fmul")])
+
+;; Addition, treating overflow by wraparound.
+(define_insn "add<mode>3"
+ [(set (match_operand:VWHB 0 "register_operand" "=f")
+ (plus:VWHB (match_operand:VWHB 1 "register_operand" "f")
+ (match_operand:VWHB 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "padd<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Addition of doubleword integers stored in FP registers.
+;; Overflow is treated by wraparound.
+;; We use 'unspec' instead of 'plus' here to avoid clash with
+;; mips.md::add<mode>3. If 'plus' was used, then such instruction
+;; would be recognized as adddi3 and reload would make it use
+;; GPRs instead of FPRs.
+(define_insn "loongson_paddd"
+ [(set (match_operand:DI 0 "register_operand" "=f")
+ (unspec:DI [(match_operand:DI 1 "register_operand" "f")
+ (match_operand:DI 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PADDD))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "paddd\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Addition, treating overflow by signed saturation.
+(define_insn "ssadd<mode>3"
+ [(set (match_operand:VHB 0 "register_operand" "=f")
+ (ss_plus:VHB (match_operand:VHB 1 "register_operand" "f")
+ (match_operand:VHB 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "padds<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Addition, treating overflow by unsigned saturation.
+(define_insn "usadd<mode>3"
+ [(set (match_operand:VHB 0 "register_operand" "=f")
+ (us_plus:VHB (match_operand:VHB 1 "register_operand" "f")
+ (match_operand:VHB 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "paddus<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Logical AND NOT.
+(define_insn "loongson_pandn_<V_suffix>"
+ [(set (match_operand:VWHBDI 0 "register_operand" "=f")
+ (and:VWHBDI
+ (not:VWHBDI (match_operand:VWHBDI 1 "register_operand" "f"))
+ (match_operand:VWHBDI 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pandn\t%0,%1,%2"
+ [(set_attr "type" "fmul")])
+
+;; Logical AND.
+(define_insn "and<mode>3"
+ [(set (match_operand:VWHB 0 "register_operand" "=f")
+ (and:VWHB (match_operand:VWHB 1 "register_operand" "f")
+ (match_operand:VWHB 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "and\t%0,%1,%2"
+ [(set_attr "type" "fmul")])
+
+;; Logical OR.
+(define_insn "ior<mode>3"
+ [(set (match_operand:VWHB 0 "register_operand" "=f")
+ (ior:VWHB (match_operand:VWHB 1 "register_operand" "f")
+ (match_operand:VWHB 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "or\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+;; Logical XOR.
+(define_insn "xor<mode>3"
+ [(set (match_operand:VWHB 0 "register_operand" "=f")
+ (xor:VWHB (match_operand:VWHB 1 "register_operand" "f")
+ (match_operand:VWHB 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "xor\t%0,%1,%2"
+ [(set_attr "type" "fmul")])
+
+;; Logical NOR.
+(define_insn "*loongson_nor"
+ [(set (match_operand:VWHB 0 "register_operand" "=f")
+ (and:VWHB
+ (not:VWHB (match_operand:VWHB 1 "register_operand" "f"))
+ (not:VWHB (match_operand:VWHB 2 "register_operand" "f"))))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "nor\t%0,%1,%2"
+ [(set_attr "type" "fmul")])
+
+;; Logical NOT.
+(define_insn "one_cmpl<mode>2"
+ [(set (match_operand:VWHB 0 "register_operand" "=f")
+ (not:VWHB (match_operand:VWHB 1 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "nor\t%0,%1,%1"
+ [(set_attr "type" "fmul")])
+
+;; Average.
+(define_insn "loongson_pavg<V_suffix>"
+ [(set (match_operand:VHB 0 "register_operand" "=f")
+ (unspec:VHB [(match_operand:VHB 1 "register_operand" "f")
+ (match_operand:VHB 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PAVG))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pavg<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Equality test.
+(define_insn "loongson_pcmpeq<V_suffix>"
+ [(set (match_operand:VWHB 0 "register_operand" "=f")
+ (unspec:VWHB [(match_operand:VWHB 1 "register_operand" "f")
+ (match_operand:VWHB 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PCMPEQ))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pcmpeq<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Greater-than test.
+(define_insn "loongson_pcmpgt<V_suffix>"
+ [(set (match_operand:VWHB 0 "register_operand" "=f")
+ (unspec:VWHB [(match_operand:VWHB 1 "register_operand" "f")
+ (match_operand:VWHB 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PCMPGT))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pcmpgt<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Extract halfword.
+(define_insn "loongson_pextrh"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:SI 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PEXTR))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pextrh\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+;; Insert halfword.
+(define_insn "loongson_pinsrh_0"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (vec_select:V4HI
+ (vec_concat:V8HI
+ (match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:V4HI 2 "register_operand" "f"))
+ (parallel [(const_int 4) (const_int 1)
+ (const_int 2) (const_int 3)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pinsrh_0\t%0,%1,%2"
+ [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_pinsrh_1"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (vec_select:V4HI
+ (vec_concat:V8HI
+ (match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:V4HI 2 "register_operand" "f"))
+ (parallel [(const_int 0) (const_int 4)
+ (const_int 2) (const_int 3)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pinsrh_1\t%0,%1,%2"
+ [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_pinsrh_2"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (vec_select:V4HI
+ (vec_concat:V8HI
+ (match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:V4HI 2 "register_operand" "f"))
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 4) (const_int 3)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pinsrh_2\t%0,%1,%2"
+ [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_pinsrh_3"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (vec_select:V4HI
+ (vec_concat:V8HI
+ (match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:V4HI 2 "register_operand" "f"))
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 4)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pinsrh_3\t%0,%1,%2"
+ [(set_attr "type" "fdiv")])
+
+(define_insn "*vec_setv4hi"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:SI 2 "register_operand" "f")
+ (match_operand:SI 3 "const_0_to_3_operand" "")]
+ UNSPEC_LOONGSON_PINSRH))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pinsrh_%3\t%0,%1,%2"
+ [(set_attr "type" "fdiv")])
+
+(define_expand "vec_setv4hi"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:HI 2 "register_operand" "f")
+ (match_operand:SI 3 "const_0_to_3_operand" "")]
+ UNSPEC_LOONGSON_PINSRH))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ rtx ext = gen_reg_rtx (SImode);
+ emit_move_insn (ext, gen_lowpart (SImode, operands[2]));
+ operands[2] = ext;
+})
+
+;; Multiply and add packed integers.
+(define_insn "loongson_pmaddhw"
+ [(set (match_operand:V2SI 0 "register_operand" "=f")
+ (unspec:V2SI [(match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:V4HI 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PMADD))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pmaddhw\t%0,%1,%2"
+ [(set_attr "type" "fmul")])
+
+(define_expand "sdot_prodv4hi"
+ [(match_operand:V2SI 0 "register_operand" "")
+ (match_operand:V4HI 1 "register_operand" "")
+ (match_operand:V4HI 2 "register_operand" "")
+ (match_operand:V2SI 3 "register_operand" "")]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ rtx t = gen_reg_rtx (V2SImode);
+ emit_insn (gen_loongson_pmaddhw (t, operands[1], operands[2]));
+ emit_insn (gen_addv2si3 (operands[0], t, operands[3]));
+ DONE;
+})
+
+;; Maximum of signed halfwords.
+(define_insn "smaxv4hi3"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (smax:V4HI (match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:V4HI 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pmaxsh\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+(define_expand "smax<mode>3"
+ [(match_operand:VWB 0 "register_operand" "")
+ (match_operand:VWB 1 "register_operand" "")
+ (match_operand:VWB 2 "register_operand" "")]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ mips_expand_vec_minmax (operands[0], operands[1], operands[2],
+ gen_loongson_pcmpgt<V_suffix>, false);
+ DONE;
+})
+
+;; Maximum of unsigned bytes.
+(define_insn "umaxv8qi3"
+ [(set (match_operand:V8QI 0 "register_operand" "=f")
+ (umax:V8QI (match_operand:V8QI 1 "register_operand" "f")
+ (match_operand:V8QI 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pmaxub\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Minimum of signed halfwords.
+(define_insn "sminv4hi3"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (smin:V4HI (match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:V4HI 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pminsh\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+(define_expand "smin<mode>3"
+ [(match_operand:VWB 0 "register_operand" "")
+ (match_operand:VWB 1 "register_operand" "")
+ (match_operand:VWB 2 "register_operand" "")]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ mips_expand_vec_minmax (operands[0], operands[1], operands[2],
+ gen_loongson_pcmpgt<V_suffix>, true);
+ DONE;
+})
+
+;; Minimum of unsigned bytes.
+(define_insn "uminv8qi3"
+ [(set (match_operand:V8QI 0 "register_operand" "=f")
+ (umin:V8QI (match_operand:V8QI 1 "register_operand" "f")
+ (match_operand:V8QI 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pminub\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Move byte mask.
+(define_insn "loongson_pmovmsk<V_suffix>"
+ [(set (match_operand:VB 0 "register_operand" "=f")
+ (unspec:VB [(match_operand:VB 1 "register_operand" "f")]
+ UNSPEC_LOONGSON_PMOVMSK))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pmovmsk<V_suffix>\t%0,%1"
+ [(set_attr "type" "fabs")])
+
+;; Multiply unsigned integers and store high result.
+(define_insn "umul<mode>3_highpart"
+ [(set (match_operand:VH 0 "register_operand" "=f")
+ (unspec:VH [(match_operand:VH 1 "register_operand" "f")
+ (match_operand:VH 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PMULHU))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pmulhu<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fmul")])
+
+;; Multiply signed integers and store high result.
+(define_insn "smul<mode>3_highpart"
+ [(set (match_operand:VH 0 "register_operand" "=f")
+ (unspec:VH [(match_operand:VH 1 "register_operand" "f")
+ (match_operand:VH 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PMULH))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pmulh<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fmul")])
+
+;; Multiply signed integers and store low result.
+(define_insn "mul<mode>3"
+ [(set (match_operand:VH 0 "register_operand" "=f")
+ (mult:VH (match_operand:VH 1 "register_operand" "f")
+ (match_operand:VH 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pmull<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fmul")])
+
+;; Multiply unsigned word integers.
+(define_insn "loongson_pmulu<V_suffix>"
+ [(set (match_operand:DI 0 "register_operand" "=f")
+ (unspec:DI [(match_operand:VW 1 "register_operand" "f")
+ (match_operand:VW 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PMULU))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pmulu<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fmul")])
+
+;; Absolute difference.
+(define_insn "loongson_pasubub"
+ [(set (match_operand:VB 0 "register_operand" "=f")
+ (unspec:VB [(match_operand:VB 1 "register_operand" "f")
+ (match_operand:VB 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PASUBUB))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pasubub\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Sum of unsigned byte integers.
+(define_insn "loongson_biadd"
+ [(set (match_operand:<V_stretch_half> 0 "register_operand" "=f")
+ (unspec:<V_stretch_half> [(match_operand:VB 1 "register_operand" "f")]
+ UNSPEC_LOONGSON_BIADD))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "biadd\t%0,%1"
+ [(set_attr "type" "fabs")])
+
+(define_insn "reduc_uplus_v8qi"
+ [(set (match_operand:V8QI 0 "register_operand" "=f")
+ (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "f")]
+ UNSPEC_LOONGSON_BIADD))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "biadd\t%0,%1"
+ [(set_attr "type" "fabs")])
+
+;; Sum of absolute differences.
+(define_insn "loongson_psadbh"
+ [(set (match_operand:<V_stretch_half> 0 "register_operand" "=f")
+ (unspec:<V_stretch_half> [(match_operand:VB 1 "register_operand" "f")
+ (match_operand:VB 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PSADBH))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pasubub\t%0,%1,%2;biadd\t%0,%0"
+ [(set_attr "type" "fadd")])
+
+;; Shuffle halfwords.
+(define_insn "loongson_pshufh"
+ [(set (match_operand:VH 0 "register_operand" "=f")
+ (unspec:VH [(match_operand:VH 1 "register_operand" "f")
+ (match_operand:SI 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PSHUFH))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "pshufh\t%0,%1,%2"
+ [(set_attr "type" "fmul")])
+
+;; Shift left logical.
+(define_insn "ashl<mode>3"
+ [(set (match_operand:VWH 0 "register_operand" "=f")
+ (ashift:VWH (match_operand:VWH 1 "register_operand" "f")
+ (match_operand:SI 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "psll<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+;; Shift right arithmetic.
+(define_insn "ashr<mode>3"
+ [(set (match_operand:VWH 0 "register_operand" "=f")
+ (ashiftrt:VWH (match_operand:VWH 1 "register_operand" "f")
+ (match_operand:SI 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "psra<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+;; Shift right logical.
+(define_insn "lshr<mode>3"
+ [(set (match_operand:VWH 0 "register_operand" "=f")
+ (lshiftrt:VWH (match_operand:VWH 1 "register_operand" "f")
+ (match_operand:SI 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "psrl<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+;; Subtraction, treating overflow by wraparound.
+(define_insn "sub<mode>3"
+ [(set (match_operand:VWHB 0 "register_operand" "=f")
+ (minus:VWHB (match_operand:VWHB 1 "register_operand" "f")
+ (match_operand:VWHB 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "psub<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Subtraction of doubleword integers stored in FP registers.
+;; Overflow is treated by wraparound.
+;; See loongson_paddd for the reason we use 'unspec' rather than
+;; 'minus' here.
+(define_insn "loongson_psubd"
+ [(set (match_operand:DI 0 "register_operand" "=f")
+ (unspec:DI [(match_operand:DI 1 "register_operand" "f")
+ (match_operand:DI 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_PSUBD))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "psubd\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Subtraction, treating overflow by signed saturation.
+(define_insn "sssub<mode>3"
+ [(set (match_operand:VHB 0 "register_operand" "=f")
+ (ss_minus:VHB (match_operand:VHB 1 "register_operand" "f")
+ (match_operand:VHB 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "psubs<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Subtraction, treating overflow by unsigned saturation.
+(define_insn "ussub<mode>3"
+ [(set (match_operand:VHB 0 "register_operand" "=f")
+ (us_minus:VHB (match_operand:VHB 1 "register_operand" "f")
+ (match_operand:VHB 2 "register_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "psubus<V_suffix>\t%0,%1,%2"
+ [(set_attr "type" "fadd")])
+
+;; Unpack high data. Recall that Loongson only runs in little-endian.
+(define_insn "loongson_punpckhbh"
+ [(set (match_operand:V8QI 0 "register_operand" "=f")
+ (vec_select:V8QI
+ (vec_concat:V16QI
+ (match_operand:V8QI 1 "register_operand" "f")
+ (match_operand:V8QI 2 "register_operand" "f"))
+ (parallel [(const_int 4) (const_int 12)
+ (const_int 5) (const_int 13)
+ (const_int 6) (const_int 14)
+ (const_int 7) (const_int 15)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpckhbh\t%0,%1,%2"
+ [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_punpckhhw"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (vec_select:V4HI
+ (vec_concat:V8HI
+ (match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:V4HI 2 "register_operand" "f"))
+ (parallel [(const_int 2) (const_int 6)
+ (const_int 3) (const_int 7)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpckhhw\t%0,%1,%2"
+ [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_punpckhhw_qi"
+ [(set (match_operand:V8QI 0 "register_operand" "=f")
+ (vec_select:V8QI
+ (vec_concat:V16QI
+ (match_operand:V8QI 1 "register_operand" "f")
+ (match_operand:V8QI 2 "register_operand" "f"))
+ (parallel [(const_int 4) (const_int 5)
+ (const_int 12) (const_int 13)
+ (const_int 6) (const_int 7)
+ (const_int 14) (const_int 15)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpckhhw\t%0,%1,%2"
+ [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_punpckhwd"
+ [(set (match_operand:V2SI 0 "register_operand" "=f")
+ (vec_select:V2SI
+ (vec_concat:V4SI
+ (match_operand:V2SI 1 "register_operand" "f")
+ (match_operand:V2SI 2 "register_operand" "f"))
+ (parallel [(const_int 1) (const_int 3)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpckhwd\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+(define_insn "loongson_punpckhwd_qi"
+ [(set (match_operand:V8QI 0 "register_operand" "=f")
+ (vec_select:V8QI
+ (vec_concat:V16QI
+ (match_operand:V8QI 1 "register_operand" "f")
+ (match_operand:V8QI 2 "register_operand" "f"))
+ (parallel [(const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)
+ (const_int 12) (const_int 13)
+ (const_int 14) (const_int 15)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpckhwd\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+(define_insn "loongson_punpckhwd_hi"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (vec_select:V4HI
+ (vec_concat:V8HI
+ (match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:V4HI 2 "register_operand" "f"))
+ (parallel [(const_int 2) (const_int 3)
+ (const_int 6) (const_int 7)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpckhwd\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+;; Unpack low data.
+(define_insn "loongson_punpcklbh"
+ [(set (match_operand:V8QI 0 "register_operand" "=f")
+ (vec_select:V8QI
+ (vec_concat:V16QI
+ (match_operand:V8QI 1 "register_operand" "f")
+ (match_operand:V8QI 2 "register_operand" "f"))
+ (parallel [(const_int 0) (const_int 8)
+ (const_int 1) (const_int 9)
+ (const_int 2) (const_int 10)
+ (const_int 3) (const_int 11)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpcklbh\t%0,%1,%2"
+ [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_punpcklhw"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (vec_select:V4HI
+ (vec_concat:V8HI
+ (match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:V4HI 2 "register_operand" "f"))
+ (parallel [(const_int 0) (const_int 4)
+ (const_int 1) (const_int 5)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpcklhw\t%0,%1,%2"
+ [(set_attr "type" "fdiv")])
+
+(define_insn "*loongson_punpcklhw_qi"
+ [(set (match_operand:V8QI 0 "register_operand" "=f")
+ (vec_select:V8QI
+ (vec_concat:V16QI
+ (match_operand:V8QI 1 "register_operand" "f")
+ (match_operand:V8QI 2 "register_operand" "f"))
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 8) (const_int 9)
+ (const_int 2) (const_int 3)
+ (const_int 10) (const_int 11)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpcklhw\t%0,%1,%2"
+ [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_punpcklwd"
+ [(set (match_operand:V2SI 0 "register_operand" "=f")
+ (vec_select:V2SI
+ (vec_concat:V4SI
+ (match_operand:V2SI 1 "register_operand" "f")
+ (match_operand:V2SI 2 "register_operand" "f"))
+ (parallel [(const_int 0) (const_int 2)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpcklwd\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+(define_insn "*loongson_punpcklwd_qi"
+ [(set (match_operand:V8QI 0 "register_operand" "=f")
+ (vec_select:V8QI
+ (vec_concat:V16QI
+ (match_operand:V8QI 1 "register_operand" "f")
+ (match_operand:V8QI 2 "register_operand" "f"))
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 8) (const_int 9)
+ (const_int 10) (const_int 11)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpcklwd\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+(define_insn "*loongson_punpcklwd_hi"
+ [(set (match_operand:V4HI 0 "register_operand" "=f")
+ (vec_select:V4HI
+ (vec_concat:V8HI
+ (match_operand:V4HI 1 "register_operand" "f")
+ (match_operand:V4HI 2 "register_operand" "f"))
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 4) (const_int 5)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "punpcklwd\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+(define_expand "vec_unpacks_lo_<mode>"
+ [(match_operand:<V_stretch_half> 0 "register_operand" "")
+ (match_operand:VHB 1 "register_operand" "")]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ mips_expand_vec_unpack (operands, false, false);
+ DONE;
+})
+
+(define_expand "vec_unpacks_hi_<mode>"
+ [(match_operand:<V_stretch_half> 0 "register_operand" "")
+ (match_operand:VHB 1 "register_operand" "")]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ mips_expand_vec_unpack (operands, false, true);
+ DONE;
+})
+
+(define_expand "vec_unpacku_lo_<mode>"
+ [(match_operand:<V_stretch_half> 0 "register_operand" "")
+ (match_operand:VHB 1 "register_operand" "")]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ mips_expand_vec_unpack (operands, true, false);
+ DONE;
+})
+
+(define_expand "vec_unpacku_hi_<mode>"
+ [(match_operand:<V_stretch_half> 0 "register_operand" "")
+ (match_operand:VHB 1 "register_operand" "")]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ mips_expand_vec_unpack (operands, true, true);
+ DONE;
+})
+
+;; Whole vector shifts, used for reduction epilogues.
+(define_insn "vec_shl_<mode>"
+ [(set (match_operand:VWHBDI 0 "register_operand" "=f")
+ (unspec:VWHBDI [(match_operand:VWHBDI 1 "register_operand" "f")
+ (match_operand:SI 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_DSLL))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "dsll\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+(define_insn "vec_shr_<mode>"
+ [(set (match_operand:VWHBDI 0 "register_operand" "=f")
+ (unspec:VWHBDI [(match_operand:VWHBDI 1 "register_operand" "f")
+ (match_operand:SI 2 "register_operand" "f")]
+ UNSPEC_LOONGSON_DSRL))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "dsrl\t%0,%1,%2"
+ [(set_attr "type" "fcvt")])
+
+(define_insn "vec_loongson_extract_lo_<mode>"
+ [(set (match_operand:<V_inner> 0 "register_operand" "=r")
+ (vec_select:<V_inner>
+ (match_operand:VWHB 1 "register_operand" "f")
+ (parallel [(const_int 0)])))]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+ "mfc1\t%0,%1"
+ [(set_attr "type" "mfc")])
+
+(define_expand "reduc_plus_scal_<mode>"
+ [(match_operand:<V_inner> 0 "register_operand" "")
+ (match_operand:VWHB 1 "register_operand" "")]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+ mips_expand_vec_reduc (tmp, operands[1], gen_add<mode>3);
+ emit_insn (gen_vec_loongson_extract_lo_<mode> (operands[0], tmp));
+ DONE;
+})
+
+(define_expand "reduc_smax_scal_<mode>"
+ [(match_operand:<V_inner> 0 "register_operand" "")
+ (match_operand:VWHB 1 "register_operand" "")]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+ mips_expand_vec_reduc (tmp, operands[1], gen_smax<mode>3);
+ emit_insn (gen_vec_loongson_extract_lo_<mode> (operands[0], tmp));
+ DONE;
+})
+
+(define_expand "reduc_smin_scal_<mode>"
+ [(match_operand:<V_inner> 0 "register_operand" "")
+ (match_operand:VWHB 1 "register_operand" "")]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+ mips_expand_vec_reduc (tmp, operands[1], gen_smin<mode>3);
+ emit_insn (gen_vec_loongson_extract_lo_<mode> (operands[0], tmp));
+ DONE;
+})
+
+(define_expand "reduc_umax_scal_<mode>"
+ [(match_operand:<V_inner> 0 "register_operand" "")
+ (match_operand:VB 1 "register_operand" "")]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+ mips_expand_vec_reduc (tmp, operands[1], gen_umax<mode>3);
+ emit_insn (gen_vec_loongson_extract_lo_<mode> (operands[0], tmp));
+ DONE;
+})
+
+(define_expand "reduc_umin_scal_<mode>"
+ [(match_operand:<V_inner> 0 "register_operand" "")
+ (match_operand:VB 1 "register_operand" "")]
+ "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI"
+{
+ rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+ mips_expand_vec_reduc (tmp, operands[1], gen_umin<mode>3);
+ emit_insn (gen_vec_loongson_extract_lo_<mode> (operands[0], tmp));
+ DONE;
+})
diff --git a/gcc/config/mips/loongson-mmiintrin.h b/gcc/config/mips/loongson-mmiintrin.h
new file mode 100644
index 0000000..6f35fb5
--- /dev/null
+++ b/gcc/config/mips/loongson-mmiintrin.h
@@ -0,0 +1,691 @@
+/* Intrinsics for Loongson MultiMedia extension Instructions operations.
+
+ Copyright (C) 2008-2018 Free Software Foundation, Inc.
+ Contributed by CodeSourcery.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _GCC_LOONGSON_MMIINTRIN_H
+#define _GCC_LOONGSON_MMIINTRIN_H
+
+#if !defined(__mips_loongson_mmi)
+# error "You must select -mloongson-mmi or -march=loongson2e/2f/3a to use \
+loongson-mmiintrin.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+/* Vectors of unsigned bytes, halfwords and words. */
+typedef uint8_t uint8x8_t __attribute__((vector_size (8)));
+typedef uint16_t uint16x4_t __attribute__((vector_size (8)));
+typedef uint32_t uint32x2_t __attribute__((vector_size (8)));
+
+/* Vectors of signed bytes, halfwords and words. */
+typedef int8_t int8x8_t __attribute__((vector_size (8)));
+typedef int16_t int16x4_t __attribute__((vector_size (8)));
+typedef int32_t int32x2_t __attribute__((vector_size (8)));
+
+/* SIMD intrinsics.
+ Unless otherwise noted, calls to the functions below will expand into
+ precisely one machine instruction, modulo any moves required to
+ satisfy register allocation constraints. */
+
+/* Pack with signed saturation. */
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+packsswh (int32x2_t s, int32x2_t t)
+{
+ return __builtin_loongson_packsswh (s, t);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+packsshb (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_packsshb (s, t);
+}
+
+/* Pack with unsigned saturation. */
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+packushb (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_packushb (s, t);
+}
+
+/* Vector addition, treating overflow by wraparound. */
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+paddw_u (uint32x2_t s, uint32x2_t t)
+{
+ return __builtin_loongson_paddw_u (s, t);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+paddh_u (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_paddh_u (s, t);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+paddb_u (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_paddb_u (s, t);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+paddw_s (int32x2_t s, int32x2_t t)
+{
+ return __builtin_loongson_paddw_s (s, t);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+paddh_s (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_paddh_s (s, t);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+paddb_s (int8x8_t s, int8x8_t t)
+{
+ return __builtin_loongson_paddb_s (s, t);
+}
+
+/* Addition of doubleword integers, treating overflow by wraparound. */
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+paddd_u (uint64_t s, uint64_t t)
+{
+ return __builtin_loongson_paddd_u (s, t);
+}
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+paddd_s (int64_t s, int64_t t)
+{
+ return __builtin_loongson_paddd_s (s, t);
+}
+
+/* Vector addition, treating overflow by signed saturation. */
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+paddsh (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_paddsh (s, t);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+paddsb (int8x8_t s, int8x8_t t)
+{
+ return __builtin_loongson_paddsb (s, t);
+}
+
+/* Vector addition, treating overflow by unsigned saturation. */
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+paddush (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_paddush (s, t);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+paddusb (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_paddusb (s, t);
+}
+
+/* Logical AND NOT. */
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+pandn_ud (uint64_t s, uint64_t t)
+{
+ return __builtin_loongson_pandn_ud (s, t);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+pandn_uw (uint32x2_t s, uint32x2_t t)
+{
+ return __builtin_loongson_pandn_uw (s, t);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+pandn_uh (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_pandn_uh (s, t);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+pandn_ub (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_pandn_ub (s, t);
+}
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+pandn_sd (int64_t s, int64_t t)
+{
+ return __builtin_loongson_pandn_sd (s, t);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+pandn_sw (int32x2_t s, int32x2_t t)
+{
+ return __builtin_loongson_pandn_sw (s, t);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pandn_sh (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_pandn_sh (s, t);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+pandn_sb (int8x8_t s, int8x8_t t)
+{
+ return __builtin_loongson_pandn_sb (s, t);
+}
+
+/* Average. */
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+pavgh (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_pavgh (s, t);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+pavgb (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_pavgb (s, t);
+}
+
+/* Equality test. */
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+pcmpeqw_u (uint32x2_t s, uint32x2_t t)
+{
+ return __builtin_loongson_pcmpeqw_u (s, t);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+pcmpeqh_u (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_pcmpeqh_u (s, t);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+pcmpeqb_u (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_pcmpeqb_u (s, t);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+pcmpeqw_s (int32x2_t s, int32x2_t t)
+{
+ return __builtin_loongson_pcmpeqw_s (s, t);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pcmpeqh_s (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_pcmpeqh_s (s, t);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+pcmpeqb_s (int8x8_t s, int8x8_t t)
+{
+ return __builtin_loongson_pcmpeqb_s (s, t);
+}
+
+/* Greater-than test. */
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+pcmpgtw_u (uint32x2_t s, uint32x2_t t)
+{
+ return __builtin_loongson_pcmpgtw_u (s, t);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+pcmpgth_u (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_pcmpgth_u (s, t);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+pcmpgtb_u (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_pcmpgtb_u (s, t);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+pcmpgtw_s (int32x2_t s, int32x2_t t)
+{
+ return __builtin_loongson_pcmpgtw_s (s, t);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pcmpgth_s (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_pcmpgth_s (s, t);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+pcmpgtb_s (int8x8_t s, int8x8_t t)
+{
+ return __builtin_loongson_pcmpgtb_s (s, t);
+}
+
+/* Extract halfword. */
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+pextrh_u (uint16x4_t s, int field /* 0--3. */)
+{
+ return __builtin_loongson_pextrh_u (s, field);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pextrh_s (int16x4_t s, int field /* 0--3. */)
+{
+ return __builtin_loongson_pextrh_s (s, field);
+}
+
+/* Insert halfword. */
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+pinsrh_0_u (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_pinsrh_0_u (s, t);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+pinsrh_1_u (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_pinsrh_1_u (s, t);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+pinsrh_2_u (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_pinsrh_2_u (s, t);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+pinsrh_3_u (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_pinsrh_3_u (s, t);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pinsrh_0_s (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_pinsrh_0_s (s, t);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pinsrh_1_s (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_pinsrh_1_s (s, t);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pinsrh_2_s (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_pinsrh_2_s (s, t);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pinsrh_3_s (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_pinsrh_3_s (s, t);
+}
+
+/* Multiply and add. */
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+pmaddhw (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_pmaddhw (s, t);
+}
+
+/* Maximum of signed halfwords. */
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pmaxsh (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_pmaxsh (s, t);
+}
+
+/* Maximum of unsigned bytes. */
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+pmaxub (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_pmaxub (s, t);
+}
+
+/* Minimum of signed halfwords. */
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pminsh (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_pminsh (s, t);
+}
+
+/* Minimum of unsigned bytes. */
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+pminub (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_pminub (s, t);
+}
+
+/* Move byte mask. */
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+pmovmskb_u (uint8x8_t s)
+{
+ return __builtin_loongson_pmovmskb_u (s);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+pmovmskb_s (int8x8_t s)
+{
+ return __builtin_loongson_pmovmskb_s (s);
+}
+
+/* Multiply unsigned integers and store high result. */
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+pmulhuh (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_pmulhuh (s, t);
+}
+
+/* Multiply signed integers and store high result. */
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pmulhh (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_pmulhh (s, t);
+}
+
+/* Multiply signed integers and store low result. */
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pmullh (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_pmullh (s, t);
+}
+
+/* Multiply unsigned word integers. */
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+pmuluw (uint32x2_t s, uint32x2_t t)
+{
+ return __builtin_loongson_pmuluw (s, t);
+}
+
+/* Absolute difference. */
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+pasubub (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_pasubub (s, t);
+}
+
+/* Sum of unsigned byte integers. */
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+biadd (uint8x8_t s)
+{
+ return __builtin_loongson_biadd (s);
+}
+
+/* Sum of absolute differences.
+ Note that this intrinsic expands into two machine instructions:
+ PASUBUB followed by BIADD. */
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+psadbh (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_psadbh (s, t);
+}
+
+/* Shuffle halfwords. */
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+pshufh_u (uint16x4_t dest, uint16x4_t s, uint8_t order)
+{
+ return __builtin_loongson_pshufh_u (s, order);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+pshufh_s (int16x4_t dest, int16x4_t s, uint8_t order)
+{
+ return __builtin_loongson_pshufh_s (s, order);
+}
+
+/* Shift left logical. */
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+psllh_u (uint16x4_t s, uint8_t amount)
+{
+ return __builtin_loongson_psllh_u (s, amount);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+psllh_s (int16x4_t s, uint8_t amount)
+{
+ return __builtin_loongson_psllh_s (s, amount);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+psllw_u (uint32x2_t s, uint8_t amount)
+{
+ return __builtin_loongson_psllw_u (s, amount);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+psllw_s (int32x2_t s, uint8_t amount)
+{
+ return __builtin_loongson_psllw_s (s, amount);
+}
+
+/* Shift right logical. */
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+psrlh_u (uint16x4_t s, uint8_t amount)
+{
+ return __builtin_loongson_psrlh_u (s, amount);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+psrlh_s (int16x4_t s, uint8_t amount)
+{
+ return __builtin_loongson_psrlh_s (s, amount);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+psrlw_u (uint32x2_t s, uint8_t amount)
+{
+ return __builtin_loongson_psrlw_u (s, amount);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+psrlw_s (int32x2_t s, uint8_t amount)
+{
+ return __builtin_loongson_psrlw_s (s, amount);
+}
+
+/* Shift right arithmetic. */
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+psrah_u (uint16x4_t s, uint8_t amount)
+{
+ return __builtin_loongson_psrah_u (s, amount);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+psrah_s (int16x4_t s, uint8_t amount)
+{
+ return __builtin_loongson_psrah_s (s, amount);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+psraw_u (uint32x2_t s, uint8_t amount)
+{
+ return __builtin_loongson_psraw_u (s, amount);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+psraw_s (int32x2_t s, uint8_t amount)
+{
+ return __builtin_loongson_psraw_s (s, amount);
+}
+
+/* Vector subtraction, treating overflow by wraparound. */
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+psubw_u (uint32x2_t s, uint32x2_t t)
+{
+ return __builtin_loongson_psubw_u (s, t);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+psubh_u (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_psubh_u (s, t);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+psubb_u (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_psubb_u (s, t);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+psubw_s (int32x2_t s, int32x2_t t)
+{
+ return __builtin_loongson_psubw_s (s, t);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+psubh_s (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_psubh_s (s, t);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+psubb_s (int8x8_t s, int8x8_t t)
+{
+ return __builtin_loongson_psubb_s (s, t);
+}
+
+/* Subtraction of doubleword integers, treating overflow by wraparound. */
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+psubd_u (uint64_t s, uint64_t t)
+{
+ return __builtin_loongson_psubd_u (s, t);
+}
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+psubd_s (int64_t s, int64_t t)
+{
+ return __builtin_loongson_psubd_s (s, t);
+}
+
+/* Vector subtraction, treating overflow by signed saturation. */
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+psubsh (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_psubsh (s, t);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+psubsb (int8x8_t s, int8x8_t t)
+{
+ return __builtin_loongson_psubsb (s, t);
+}
+
+/* Vector subtraction, treating overflow by unsigned saturation. */
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+psubush (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_psubush (s, t);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+psubusb (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_psubusb (s, t);
+}
+
+/* Unpack high data. */
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+punpckhwd_u (uint32x2_t s, uint32x2_t t)
+{
+ return __builtin_loongson_punpckhwd_u (s, t);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+punpckhhw_u (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_punpckhhw_u (s, t);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+punpckhbh_u (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_punpckhbh_u (s, t);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+punpckhwd_s (int32x2_t s, int32x2_t t)
+{
+ return __builtin_loongson_punpckhwd_s (s, t);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+punpckhhw_s (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_punpckhhw_s (s, t);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+punpckhbh_s (int8x8_t s, int8x8_t t)
+{
+ return __builtin_loongson_punpckhbh_s (s, t);
+}
+
+/* Unpack low data. */
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+punpcklwd_u (uint32x2_t s, uint32x2_t t)
+{
+ return __builtin_loongson_punpcklwd_u (s, t);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+punpcklhw_u (uint16x4_t s, uint16x4_t t)
+{
+ return __builtin_loongson_punpcklhw_u (s, t);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+punpcklbh_u (uint8x8_t s, uint8x8_t t)
+{
+ return __builtin_loongson_punpcklbh_u (s, t);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+punpcklwd_s (int32x2_t s, int32x2_t t)
+{
+ return __builtin_loongson_punpcklwd_s (s, t);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+punpcklhw_s (int16x4_t s, int16x4_t t)
+{
+ return __builtin_loongson_punpcklhw_s (s, t);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+punpcklbh_s (int8x8_t s, int8x8_t t)
+{
+ return __builtin_loongson_punpcklbh_s (s, t);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/gcc/config/mips/loongson.h b/gcc/config/mips/loongson.h
index 3a99878..3d0c26b 100644
--- a/gcc/config/mips/loongson.h
+++ b/gcc/config/mips/loongson.h
@@ -1,4 +1,4 @@
-/* Intrinsics for ST Microelectronics Loongson-2E/2F SIMD operations.
+/* Intrinsics for Loongson MultiMedia extension Instructions operations.
Copyright (C) 2008-2018 Free Software Foundation, Inc.
Contributed by CodeSourcery.
@@ -24,2701 +24,9 @@
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
-#ifndef _GCC_LOONGSON_H
-#define _GCC_LOONGSON_H
-
-#if !defined(__mips_loongson_vector_rev)
-# error "You must select -march=loongson2e/2f/3a to use loongson.h"
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stdint.h>
-
-/* Vectors of unsigned bytes, halfwords and words. */
-typedef uint8_t uint8x8_t __attribute__((vector_size (8)));
-typedef uint16_t uint16x4_t __attribute__((vector_size (8)));
-typedef uint32_t uint32x2_t __attribute__((vector_size (8)));
-
-/* Vectors of signed bytes, halfwords and words. */
-typedef int8_t int8x8_t __attribute__((vector_size (8)));
-typedef int16_t int16x4_t __attribute__((vector_size (8)));
-typedef int32_t int32x2_t __attribute__((vector_size (8)));
-
-/* SIMD intrinsics.
- Unless otherwise noted, calls to the functions below will expand into
- precisely one machine instruction, modulo any moves required to
- satisfy register allocation constraints. */
-
-/* Pack with signed saturation. */
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-packsswh (int32x2_t s, int32x2_t t)
-{
- return __builtin_loongson_packsswh (s, t);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-packsshb (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_packsshb (s, t);
-}
-
-/* Pack with unsigned saturation. */
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-packushb (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_packushb (s, t);
-}
-
-/* Vector addition, treating overflow by wraparound. */
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-paddw_u (uint32x2_t s, uint32x2_t t)
-{
- return __builtin_loongson_paddw_u (s, t);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-paddh_u (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_paddh_u (s, t);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-paddb_u (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_paddb_u (s, t);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-paddw_s (int32x2_t s, int32x2_t t)
-{
- return __builtin_loongson_paddw_s (s, t);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-paddh_s (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_paddh_s (s, t);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-paddb_s (int8x8_t s, int8x8_t t)
-{
- return __builtin_loongson_paddb_s (s, t);
-}
-
-/* Addition of doubleword integers, treating overflow by wraparound. */
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-paddd_u (uint64_t s, uint64_t t)
-{
- return __builtin_loongson_paddd_u (s, t);
-}
-
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-paddd_s (int64_t s, int64_t t)
-{
- return __builtin_loongson_paddd_s (s, t);
-}
-
-/* Vector addition, treating overflow by signed saturation. */
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-paddsh (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_paddsh (s, t);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-paddsb (int8x8_t s, int8x8_t t)
-{
- return __builtin_loongson_paddsb (s, t);
-}
-
-/* Vector addition, treating overflow by unsigned saturation. */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-paddush (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_paddush (s, t);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-paddusb (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_paddusb (s, t);
-}
-
-/* Logical AND NOT. */
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-pandn_ud (uint64_t s, uint64_t t)
-{
- return __builtin_loongson_pandn_ud (s, t);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-pandn_uw (uint32x2_t s, uint32x2_t t)
-{
- return __builtin_loongson_pandn_uw (s, t);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pandn_uh (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_pandn_uh (s, t);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-pandn_ub (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_pandn_ub (s, t);
-}
-
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-pandn_sd (int64_t s, int64_t t)
-{
- return __builtin_loongson_pandn_sd (s, t);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-pandn_sw (int32x2_t s, int32x2_t t)
-{
- return __builtin_loongson_pandn_sw (s, t);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pandn_sh (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_pandn_sh (s, t);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-pandn_sb (int8x8_t s, int8x8_t t)
-{
- return __builtin_loongson_pandn_sb (s, t);
-}
-
-/* Average. */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pavgh (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_pavgh (s, t);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-pavgb (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_pavgb (s, t);
-}
-
-/* Equality test. */
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-pcmpeqw_u (uint32x2_t s, uint32x2_t t)
-{
- return __builtin_loongson_pcmpeqw_u (s, t);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pcmpeqh_u (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_pcmpeqh_u (s, t);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-pcmpeqb_u (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_pcmpeqb_u (s, t);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-pcmpeqw_s (int32x2_t s, int32x2_t t)
-{
- return __builtin_loongson_pcmpeqw_s (s, t);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pcmpeqh_s (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_pcmpeqh_s (s, t);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-pcmpeqb_s (int8x8_t s, int8x8_t t)
-{
- return __builtin_loongson_pcmpeqb_s (s, t);
-}
-
-/* Greater-than test. */
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-pcmpgtw_u (uint32x2_t s, uint32x2_t t)
-{
- return __builtin_loongson_pcmpgtw_u (s, t);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pcmpgth_u (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_pcmpgth_u (s, t);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-pcmpgtb_u (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_pcmpgtb_u (s, t);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-pcmpgtw_s (int32x2_t s, int32x2_t t)
-{
- return __builtin_loongson_pcmpgtw_s (s, t);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pcmpgth_s (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_pcmpgth_s (s, t);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-pcmpgtb_s (int8x8_t s, int8x8_t t)
-{
- return __builtin_loongson_pcmpgtb_s (s, t);
-}
-
-/* Extract halfword. */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pextrh_u (uint16x4_t s, int field /* 0--3 */)
-{
- return __builtin_loongson_pextrh_u (s, field);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pextrh_s (int16x4_t s, int field /* 0--3 */)
-{
- return __builtin_loongson_pextrh_s (s, field);
-}
-
-/* Insert halfword. */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pinsrh_0_u (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_pinsrh_0_u (s, t);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pinsrh_1_u (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_pinsrh_1_u (s, t);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pinsrh_2_u (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_pinsrh_2_u (s, t);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pinsrh_3_u (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_pinsrh_3_u (s, t);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pinsrh_0_s (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_pinsrh_0_s (s, t);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pinsrh_1_s (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_pinsrh_1_s (s, t);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pinsrh_2_s (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_pinsrh_2_s (s, t);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pinsrh_3_s (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_pinsrh_3_s (s, t);
-}
-
-/* Multiply and add. */
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-pmaddhw (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_pmaddhw (s, t);
-}
-
-/* Maximum of signed halfwords. */
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pmaxsh (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_pmaxsh (s, t);
-}
-
-/* Maximum of unsigned bytes. */
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-pmaxub (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_pmaxub (s, t);
-}
-
-/* Minimum of signed halfwords. */
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pminsh (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_pminsh (s, t);
-}
-
-/* Minimum of unsigned bytes. */
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-pminub (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_pminub (s, t);
-}
-
-/* Move byte mask. */
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-pmovmskb_u (uint8x8_t s)
-{
- return __builtin_loongson_pmovmskb_u (s);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-pmovmskb_s (int8x8_t s)
-{
- return __builtin_loongson_pmovmskb_s (s);
-}
-
-/* Multiply unsigned integers and store high result. */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pmulhuh (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_pmulhuh (s, t);
-}
-
-/* Multiply signed integers and store high result. */
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pmulhh (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_pmulhh (s, t);
-}
-
-/* Multiply signed integers and store low result. */
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pmullh (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_pmullh (s, t);
-}
-
-/* Multiply unsigned word integers. */
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-pmuluw (uint32x2_t s, uint32x2_t t)
-{
- return __builtin_loongson_pmuluw (s, t);
-}
-
-/* Absolute difference. */
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-pasubub (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_pasubub (s, t);
-}
-
-/* Sum of unsigned byte integers. */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-biadd (uint8x8_t s)
-{
- return __builtin_loongson_biadd (s);
-}
-
-/* Sum of absolute differences.
- Note that this intrinsic expands into two machine instructions:
- PASUBUB followed by BIADD. */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-psadbh (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_psadbh (s, t);
-}
-
-/* Shuffle halfwords. */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pshufh_u (uint16x4_t dest, uint16x4_t s, uint8_t order)
-{
- return __builtin_loongson_pshufh_u (s, order);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pshufh_s (int16x4_t dest, int16x4_t s, uint8_t order)
-{
- return __builtin_loongson_pshufh_s (s, order);
-}
-
-/* Shift left logical. */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-psllh_u (uint16x4_t s, uint8_t amount)
-{
- return __builtin_loongson_psllh_u (s, amount);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-psllh_s (int16x4_t s, uint8_t amount)
-{
- return __builtin_loongson_psllh_s (s, amount);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-psllw_u (uint32x2_t s, uint8_t amount)
-{
- return __builtin_loongson_psllw_u (s, amount);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-psllw_s (int32x2_t s, uint8_t amount)
-{
- return __builtin_loongson_psllw_s (s, amount);
-}
-
-/* Shift right logical. */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-psrlh_u (uint16x4_t s, uint8_t amount)
-{
- return __builtin_loongson_psrlh_u (s, amount);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-psrlh_s (int16x4_t s, uint8_t amount)
-{
- return __builtin_loongson_psrlh_s (s, amount);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-psrlw_u (uint32x2_t s, uint8_t amount)
-{
- return __builtin_loongson_psrlw_u (s, amount);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-psrlw_s (int32x2_t s, uint8_t amount)
-{
- return __builtin_loongson_psrlw_s (s, amount);
-}
-
-/* Shift right arithmetic. */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-psrah_u (uint16x4_t s, uint8_t amount)
-{
- return __builtin_loongson_psrah_u (s, amount);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-psrah_s (int16x4_t s, uint8_t amount)
-{
- return __builtin_loongson_psrah_s (s, amount);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-psraw_u (uint32x2_t s, uint8_t amount)
-{
- return __builtin_loongson_psraw_u (s, amount);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-psraw_s (int32x2_t s, uint8_t amount)
-{
- return __builtin_loongson_psraw_s (s, amount);
-}
-
-/* Vector subtraction, treating overflow by wraparound. */
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-psubw_u (uint32x2_t s, uint32x2_t t)
-{
- return __builtin_loongson_psubw_u (s, t);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-psubh_u (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_psubh_u (s, t);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-psubb_u (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_psubb_u (s, t);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-psubw_s (int32x2_t s, int32x2_t t)
-{
- return __builtin_loongson_psubw_s (s, t);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-psubh_s (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_psubh_s (s, t);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-psubb_s (int8x8_t s, int8x8_t t)
-{
- return __builtin_loongson_psubb_s (s, t);
-}
-
-/* Subtraction of doubleword integers, treating overflow by wraparound. */
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-psubd_u (uint64_t s, uint64_t t)
-{
- return __builtin_loongson_psubd_u (s, t);
-}
-
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-psubd_s (int64_t s, int64_t t)
-{
- return __builtin_loongson_psubd_s (s, t);
-}
-
-/* Vector subtraction, treating overflow by signed saturation. */
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-psubsh (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_psubsh (s, t);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-psubsb (int8x8_t s, int8x8_t t)
-{
- return __builtin_loongson_psubsb (s, t);
-}
-
-/* Vector subtraction, treating overflow by unsigned saturation. */
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-psubush (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_psubush (s, t);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-psubusb (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_psubusb (s, t);
-}
-
-/* Unpack high data. */
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-punpckhwd_u (uint32x2_t s, uint32x2_t t)
-{
- return __builtin_loongson_punpckhwd_u (s, t);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-punpckhhw_u (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_punpckhhw_u (s, t);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-punpckhbh_u (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_punpckhbh_u (s, t);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-punpckhwd_s (int32x2_t s, int32x2_t t)
-{
- return __builtin_loongson_punpckhwd_s (s, t);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-punpckhhw_s (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_punpckhhw_s (s, t);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-punpckhbh_s (int8x8_t s, int8x8_t t)
-{
- return __builtin_loongson_punpckhbh_s (s, t);
-}
-
-/* Unpack low data. */
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-punpcklwd_u (uint32x2_t s, uint32x2_t t)
-{
- return __builtin_loongson_punpcklwd_u (s, t);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-punpcklhw_u (uint16x4_t s, uint16x4_t t)
-{
- return __builtin_loongson_punpcklhw_u (s, t);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-punpcklbh_u (uint8x8_t s, uint8x8_t t)
-{
- return __builtin_loongson_punpcklbh_u (s, t);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-punpcklwd_s (int32x2_t s, int32x2_t t)
-{
- return __builtin_loongson_punpcklwd_s (s, t);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-punpcklhw_s (int16x4_t s, int16x4_t t)
-{
- return __builtin_loongson_punpcklhw_s (s, t);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-punpcklbh_s (int8x8_t s, int8x8_t t)
-{
- return __builtin_loongson_punpcklbh_s (s, t);
-}
-
-/* SSE2-style Vectors */
-typedef double __v1df __attribute__ ((__vector_size__ (8)));
-typedef long long __v1di __attribute__ ((__vector_size__ (8)));
-typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8)));
-typedef int __v2si __attribute__ ((__vector_size__ (8)));
-typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
-typedef short __v4hi __attribute__ ((__vector_size__ (8)));
-typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8)));
-typedef char __v8qi __attribute__ ((__vector_size__ (8)));
-typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8)));
-
-typedef struct __v2df { __v1df hi; __v1df lo; } __v2df;
-typedef struct __v2di { __v1di hi; __v1di lo; } __v2di;
-typedef struct __v2du { __v1du hi; __v1du lo; } __v2du;
-typedef struct __v4si { __v2si hi; __v2si lo; } __v4si;
-typedef struct __v4su { __v2su hi; __v2su lo; } __v4su;
-typedef struct __v8hi { __v4hi hi; __v4hi lo; } __v8hi;
-typedef struct __v8hu { __v4hu hi; __v4hu lo; } __v8hu;
-typedef struct __v16qi { __v8qi hi; __v8qi lo; } __v16qi;
-typedef struct __v16qu { __v8qu hi; __v8qu lo; } __v16qu;
-
-typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
-typedef long long __m64i __attribute__ ((__vector_size__ (8), __may_alias__));
-typedef double __m64d __attribute__ ((__vector_size__ (8), __may_alias__));
-
-typedef struct __m128 { __m64 hi; __m64 lo; } __m128;
-typedef struct __m128i { __m64i hi; __m64i lo; } __m128i;
-typedef struct __m128d { __m64d hi; __m64d lo; } __m128d;
-
-/* Create a selector for use with the SHUFPD instruction. */
-#define _MM_SHUFFLE2(fp1,fp0) \
- (((fp1) << 1) | (fp0))
-
-/* Create a vector with element 0 as F and the rest zero. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set_sd (double __F)
-{
- /* return __extension__ (__m128d){ __F, 0.0 }; */
- __m128d val;
- val.lo = (__m64d){ __F };
- val.hi = (__m64d){ 0.0 };
- return __extension__ val;
-}
-
-/* Create a vector with both elements equal to F. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set1_pd (double __F)
-{
- /* return __extension__ (__m128d){ __F, __F }; */
- __m128d val;
- val.lo = (__m64d){ __F };
- val.hi = (__m64d){ __F };
- return __extension__ val;
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set_pd1 (double __F)
-{
- return _mm_set1_pd (__F);
-}
-
-/* Create a vector with the lower value X and upper value W. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set_pd (double __W, double __X)
-{
- /* return __extension__ (__m128d){ __X, __W }; */
- __m128d val;
- val.lo = (__m64d){ __X };
- val.hi = (__m64d){ __W };
- return __extension__ val;
-}
-
-/* Create a vector with the lower value W and upper value X. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_setr_pd (double __W, double __X)
-{
- /* return __extension__ (__m128d){ __W, __X }; */
- __m128d val;
- val.lo = (__m64d){ __W };
- val.hi = (__m64d){ __X };
- return __extension__ val;
-}
-
-/* Create an undefined vector. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_undefined_pd (void)
-{
- __m128d __Y = __Y;
- return __Y;
-}
-
-/* Create a vector of zeros. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_setzero_pd (void)
-{
- /* return __extension__ (__m128d){ 0.0, 0.0 }; */
- __m128d val;
- val.lo = (__m64d){ 0.0 };
- val.hi = (__m64d){ 0.0 };
- return __extension__ val;
-}
-
-#if 0 /* FIXME */
-/* Sets the low DPFP value of A from the low value of B. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_move_sd (__m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
-}
-
-/* Load two DPFP values from P. The address must be 16-byte aligned. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_load_pd (double const *__P)
-{
- return *(__m128d *)__P;
-}
-
-/* Load two DPFP values from P. The address need not be 16-byte aligned. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_loadu_pd (double const *__P)
-{
- return __builtin_ia32_loadupd (__P);
-}
-#endif
-
-/* Create a vector with all two elements equal to *P. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_load1_pd (double const *__P)
-{
- return _mm_set1_pd (*__P);
-}
-
-/* Create a vector with element 0 as *P and the rest zero. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_load_sd (double const *__P)
-{
- return _mm_set_sd (*__P);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_load_pd1 (double const *__P)
-{
- return _mm_load1_pd (__P);
-}
-
-#if 0 /* FIXME */
-/* Load two DPFP values in reverse order. The address must be aligned. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_loadr_pd (double const *__P)
-{
- __m128d __tmp = _mm_load_pd (__P);
- return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
-}
-
-/* Store two DPFP values. The address must be 16-byte aligned. */
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_store_pd (double *__P, __m128d __A)
-{
- *(__m128d *)__P = __A;
-}
-
-/* Store two DPFP values. The address need not be 16-byte aligned. */
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_storeu_pd (double *__P, __m128d __A)
-{
- __builtin_ia32_storeupd (__P, __A);
-}
-#endif
-
-/* Stores the lower DPFP value. */
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_store_sd (double *__P, __m128d __A)
-{
- /* *__P = ((__v2df)__A)[0]; */
- __asm__ volatile (
- "sdc1 %[lo], %[__P] \n\t"
- ::[lo]"f"((__m64i)__A.lo), [__P]"m"(*__P)
- : "memory"
- );
-}
-
-extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsd_f64 (__m128d __A)
-{
- /* return ((__v2df)__A)[0]; */
- double val;
- __asm__ volatile (
- "ldc1 %[val], %[lo] \n\t"
- : [val]"=&f"(val)
- : [lo]"m"(__A.lo)
- );
- return val;
-}
-
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_storel_pd (double *__P, __m128d __A)
-{
- _mm_store_sd (__P, __A);
-}
-
-/* Stores the upper DPFP value. */
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_storeh_pd (double *__P, __m128d __A)
-{
- /* *__P = ((__v2df)__A)[1]; */
- __asm__ volatile (
- "sdc1 %[hi], %[__P] \n\t"
- ::[hi]"f"((__m64i)__A.hi), [__P]"m"(*__P)
- : "memory"
- );
-}
-
-#if 0 /* FIXME */
-/* Store the lower DPFP value across two words.
- The address must be 16-byte aligned. */
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_store1_pd (double *__P, __m128d __A)
-{
- _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
-}
-
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_store_pd1 (double *__P, __m128d __A)
-{
- _mm_store1_pd (__P, __A);
-}
-
-/* Store two DPFP values in reverse order. The address must be aligned. */
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_storer_pd (double *__P, __m128d __A)
-{
- _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
-}
-#endif
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi128_si32 (__m128i __A)
-{
- /* return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0); */
- int val;
- __asm__ volatile (
- "mfc1 %[val], %[lo] \n\t"
- : [val]"=&r"(val)
- : [lo]"f"(__A.lo)
- );
- return val;
-}
-
-extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi128_si64 (__m128i __A)
-{
- /* return ((__v2di)__A)[0]; */
- long long val;
- __asm__ volatile (
- "dmfc1 %[val], %[lo] \n\t"
- : [val]"=&r"(val)
- : [lo]"f"(__A.lo)
- );
- return val;
-}
-
-extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi128_si64x (__m128i __A)
-{
- /* return ((__v2di)__A)[0]; */
- long long val;
- __asm__ volatile (
- "dmfc1 %[val], %[lo] \n\t"
- : [val]"=&r"(val)
- : [lo]"f"(__A.lo)
- );
- return val;
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_add_pd (__m128d __A, __m128d __B)
-{
- /* return (__m128d) ((__v2df)__A + (__v2df)__B); */
- __m128d val;
- val.lo = (__m64d) ((__v1df)__A.lo + (__v1df)__B.lo);
- val.hi = (__m64d) ((__v1df)__A.hi + (__v1df)__B.hi);
- return val;
-}
-
-#if 0 /* FIXME */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_add_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
-}
-#endif
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sub_pd (__m128d __A, __m128d __B)
-{
- /* return (__m128d) ((__v2df)__A - (__v2df)__B); */
- __m128d val;
- val.lo = (__m64d) ((__v1df)__A.lo - (__v1df)__B.lo);
- val.hi = (__m64d) ((__v1df)__A.hi - (__v1df)__B.hi);
- return val;
-}
-
-#if 0 /* FIXME */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sub_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
-}
-#endif
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mul_pd (__m128d __A, __m128d __B)
-{
- /* return (__m128d) ((__v2df)__A * (__v2df)__B); */
- __m128d val;
- val.lo = (__m64d) ((__v1df)__A.lo * (__v1df)__B.lo);
- val.hi = (__m64d) ((__v1df)__A.hi * (__v1df)__B.hi);
- return val;
-}
-
-#if 0 /* FIXME */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mul_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
-}
-#endif
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_div_pd (__m128d __A, __m128d __B)
-{
- /* return (__m128d) ((__v2df)__A / (__v2df)__B); */
- __m128d val;
- val.lo = (__m64d) ((__v1df)__A.lo / (__v1df)__B.lo);
- val.hi = (__m64d) ((__v1df)__A.hi / (__v1df)__B.hi);
- return val;
-}
-
-#if 0 /* FIXME */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_div_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sqrt_pd (__m128d __A)
-{
- return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
-}
-
-/* Return pair {sqrt (B[0]), A[1]}. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sqrt_sd (__m128d __A, __m128d __B)
-{
- __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
- return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_min_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_min_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_max_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_max_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_and_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_andnot_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_or_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_xor_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpeq_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmplt_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmple_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpgt_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpge_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpneq_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpnlt_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpnle_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpngt_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpnge_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpord_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpunord_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpeq_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmplt_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmple_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpgt_sd (__m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
- (__v2df)
- __builtin_ia32_cmpltsd ((__v2df) __B,
- (__v2df)
- __A));
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpge_sd (__m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
- (__v2df)
- __builtin_ia32_cmplesd ((__v2df) __B,
- (__v2df)
- __A));
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpneq_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpnlt_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpnle_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpngt_sd (__m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
- (__v2df)
- __builtin_ia32_cmpnltsd ((__v2df) __B,
- (__v2df)
- __A));
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpnge_sd (__m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
- (__v2df)
- __builtin_ia32_cmpnlesd ((__v2df) __B,
- (__v2df)
- __A));
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpord_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpunord_sd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_comieq_sd (__m128d __A, __m128d __B)
-{
- return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_comilt_sd (__m128d __A, __m128d __B)
-{
- return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_comile_sd (__m128d __A, __m128d __B)
-{
- return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_comigt_sd (__m128d __A, __m128d __B)
-{
- return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_comige_sd (__m128d __A, __m128d __B)
-{
- return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_comineq_sd (__m128d __A, __m128d __B)
-{
- return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_ucomieq_sd (__m128d __A, __m128d __B)
-{
- return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_ucomilt_sd (__m128d __A, __m128d __B)
-{
- return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_ucomile_sd (__m128d __A, __m128d __B)
-{
- return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_ucomigt_sd (__m128d __A, __m128d __B)
-{
- return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_ucomige_sd (__m128d __A, __m128d __B)
-{
- return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_ucomineq_sd (__m128d __A, __m128d __B)
-{
- return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
-}
-#endif
-
-/* Create a vector of Qi, where i is the element number. */
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set_epi64x (long long __q1, long long __q0)
-{
- /* return __extension__ (__m128i)(__v2di){ __q0, __q1 }; */
- __m128i val;
- val.lo = (__m64i)(__v1di){ __q0 };
- val.hi = (__m64i)(__v1di){ __q1 };
- return __extension__ val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set_epi64 (__m64 __q1, __m64 __q0)
-{
- return _mm_set_epi64x ((long long)__q1, (long long)__q0);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
-{
- /* return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; */
- __m128i val;
- val.lo = (__m64i)(__v2si){ __q0, __q1 };
- val.hi = (__m64i)(__v2si){ __q2, __q3 };
- return __extension__ val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
- short __q3, short __q2, short __q1, short __q0)
-{
- /* return __extension__ (__m128i)(__v8hi){
- __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; */
- __m128i val;
- val.lo = (__m64i)(__v4hi){ __q0, __q1, __q2, __q3 };
- val.hi = (__m64i)(__v4hi){ __q4, __q5, __q6, __q7 };
- return __extension__ val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
- char __q11, char __q10, char __q09, char __q08,
- char __q07, char __q06, char __q05, char __q04,
- char __q03, char __q02, char __q01, char __q00)
-{
- /* return __extension__ (__m128i)(__v16qi){
- __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
- __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
- }; */
- __m128i val;
- val.lo = (__m64i)(__v8qi){ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07 };
- val.hi = (__m64i)(__v8qi){ __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 };
- return __extension__ val;
-}
-
-/* Set all of the elements of the vector to A. */
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set1_epi64x (long long __A)
-{
- return _mm_set_epi64x (__A, __A);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set1_epi64 (__m64 __A)
-{
- return _mm_set_epi64 (__A, __A);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set1_epi32 (int __A)
-{
- return _mm_set_epi32 (__A, __A, __A, __A);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set1_epi16 (short __A)
-{
- return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_set1_epi8 (char __A)
-{
- return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
- __A, __A, __A, __A, __A, __A, __A, __A);
-}
-
-/* Create a vector of Qi, where i is the element number.
- The parameter order is reversed from the _mm_set_epi* functions. */
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_setr_epi64 (__m64 __q0, __m64 __q1)
-{
- return _mm_set_epi64 (__q1, __q0);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
-{
- return _mm_set_epi32 (__q3, __q2, __q1, __q0);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
- short __q4, short __q5, short __q6, short __q7)
-{
- return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
- char __q04, char __q05, char __q06, char __q07,
- char __q08, char __q09, char __q10, char __q11,
- char __q12, char __q13, char __q14, char __q15)
-{
- return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
- __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
-}
-
-/* Create a vector with element 0 as *P and the rest zero. */
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_load_si128 (__m128i const *__P)
-{
- return *__P;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_loadu_si128 (__m128i const *__P)
-{
- /* return (__m128i) __builtin_ia32_loaddqu ((char const *)__P); */
- __m128i val;
- __asm__ volatile (
- "gsldlc1 %[lo], 0x07(%[__P]) \n\t"
- "gsldrc1 %[lo], 0x00(%[__P]) \n\t"
- "gsldlc1 %[hi], 0x0f(%[__P]) \n\t"
- "gsldrc1 %[hi], 0x08(%[__P]) \n\t"
- : [hi]"=&f"(val.hi), [lo]"=&f"(val.lo)
- : [__P]"r"(__P)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_loadl_epi64 (__m128i const *__P)
-{
- return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
-}
-
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_store_si128 (__m128i *__P, __m128i __B)
-{
- /* *__P = __B; */
- __asm__ volatile (
- "gssqc1 %[hi], %[lo], 0x00(%[__P]) \n\t"
- ::[hi]"f"(__B.hi), [lo]"f"(__B.lo),
- [__P]"r"(__P)
- : "memory"
- );
-}
-
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_storeu_si128 (__m128i *__P, __m128i __B)
-{
- /* __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B); */
- __asm__ volatile (
- "gssdlc1 %[lo], 0x07(%[__P]) \n\t"
- "gssdrc1 %[lo], 0x00(%[__P]) \n\t"
- "gssdlc1 %[hi], 0x0f(%[__P]) \n\t"
- "gssdrc1 %[hi], 0x08(%[__P]) \n\t"
- ::[hi]"f"(__B.hi), [lo]"f"(__B.lo),
- [__P]"r"(__P)
- : "memory"
- );
-}
-
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_storel_epi64 (__m128i *__P, __m128i __B)
-{
- /* *(long long *)__P = ((__v2di)__B)[0]; */
- __asm__ volatile (
- "sdc1 %[lo], %[__P] \n\t"
- ::[lo]"f"(__B.lo), [__P]"m"(*__P)
- : "memory"
- );
-}
-
-extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_movepi64_pi64 (__m128i __B)
-{
- /* return (__m64) ((__v2di)__B)[0]; */
- __m64 val;
- __asm__ volatile (
- "dmfc1 %[val], %[lo] \n\t"
- : [val]"=&r"(val)
- : [lo]"f"(__B.lo)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_movpi64_epi64 (__m64 __A)
-{
- return _mm_set_epi64 ((__m64)0LL, __A);
-}
-
-#if 0 /* FIXME */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_move_epi64 (__m128i __A)
-{
- return (__m128i)__builtin_ia32_movq128 ((__v2di) __A);
-}
-
-/* Create an undefined vector. */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_undefined_si128 (void)
-{
- __m128i __Y = __Y;
- return __Y;
-}
-#endif
-
-/* Create a vector of zeros. */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_setzero_si128 (void)
-{
- /* return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; */
- __m128i val;
- val.hi = (__m64i)(__v2si){ 0, 0 };
- val.lo = (__m64i)(__v2si){ 0, 0 };
- return val;
-}
-
-#if 0 /* FIXME */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtepi32_pd (__m128i __A)
-{
- return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
-}
-
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtepi32_ps (__m128i __A)
-{
- return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtpd_epi32 (__m128d __A)
-{
- return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
-}
-
-extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtpd_pi32 (__m128d __A)
-{
- return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
-}
-
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtpd_ps (__m128d __A)
-{
- return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttpd_epi32 (__m128d __A)
-{
- return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
-}
-
-extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttpd_pi32 (__m128d __A)
-{
- return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtpi32_pd (__m64 __A)
-{
- return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtps_epi32 (__m128 __A)
-{
- return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttps_epi32 (__m128 __A)
-{
- return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtps_pd (__m128 __A)
-{
- return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsd_si32 (__m128d __A)
-{
- return __builtin_ia32_cvtsd2si ((__v2df) __A);
-}
-
-#ifdef __x86_64__
-/* Intel intrinsic. */
-extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsd_si64 (__m128d __A)
-{
- return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
-}
-
-/* Microsoft intrinsic. */
-extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsd_si64x (__m128d __A)
-{
- return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
-}
-#endif
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttsd_si32 (__m128d __A)
-{
- return __builtin_ia32_cvttsd2si ((__v2df) __A);
-}
-
-#ifdef __x86_64__
-/* Intel intrinsic. */
-extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttsd_si64 (__m128d __A)
-{
- return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
-}
-
-/* Microsoft intrinsic. */
-extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttsd_si64x (__m128d __A)
-{
- return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
-}
-#endif
-
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsd_ss (__m128 __A, __m128d __B)
-{
- return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi32_sd (__m128d __A, int __B)
-{
- return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
-}
-
-#ifdef __x86_64__
-/* Intel intrinsic. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi64_sd (__m128d __A, long long __B)
-{
- return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
-}
-
-/* Microsoft intrinsic. */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi64x_sd (__m128d __A, long long __B)
-{
- return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
-}
-#endif
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtss_sd (__m128d __A, __m128 __B)
-{
- return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
-{
- return (__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, __mask);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_unpackhi_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_unpacklo_pd (__m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
-}
-#endif
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_loadh_pd (__m128d __A, double const *__B)
-{
- /* return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B); */
- __asm__ volatile (
- "sdc1 %[__B], 0x00+%[ahi] \n\t"
- : [ahi]"=m"(__A.hi)
- : [__B]"f"(*__B)
- : "memory"
- );
- return __A;
-}
-
-#if 0 /* FIXME */
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_loadl_pd (__m128d __A, double const *__B)
-{
- return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
-}
-
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_movemask_pd (__m128d __A)
-{
- return __builtin_ia32_movmskpd ((__v2df)__A);
-}
-#endif
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_packs_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_packsshb ((int16x4_t)__A.hi, (int16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_packsshb ((int16x4_t)__A.lo, (int16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_packs_epi32 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_packsswh ((int32x2_t)__A.hi, (int32x2_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_packsswh ((int32x2_t)__A.lo, (int32x2_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_packus_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_packushb ((uint16x4_t)__A.hi, (uint16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_packushb ((uint16x4_t)__A.lo, (uint16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_punpckhbh_s ((int8x8_t)__A.hi, (int8x8_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_punpckhbh_s ((int8x8_t)__A.lo, (int8x8_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_punpckhhw_s ((int16x4_t)__A.hi, (int16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_punpckhhw_s ((int16x4_t)__A.lo, (int16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_punpckhwd_s ((__v2si)__A.hi, (__v2si)__B.hi);
- val.lo = (__m64i) __builtin_loongson_punpckhwd_s ((__v2si)__A.lo, (__v2si)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B); */
- __m128i val;
- __asm__ volatile (
- "mov.d %[vlo], %[ahi] \n\t"
- "mov.d %[vhi], %[bhi] \n\t"
- : [vhi]"=&f"(val.hi), [vlo]"=&f"(val.lo)
- : [ahi]"f"(__A.hi), [bhi]"f"(__B.hi)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_punpcklbh_s ((int8x8_t)__A.hi, (int8x8_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_punpcklbh_s ((int8x8_t)__A.lo, (int8x8_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_punpcklhw_s ((int16x4_t)__A.hi, (int16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_punpcklhw_s ((int16x4_t)__A.lo, (int16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_punpcklwd_s ((__v2si)__A.hi, (__v2si)__B.hi);
- val.lo = (__m64i) __builtin_loongson_punpcklwd_s ((__v2si)__A.lo, (__v2si)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B); */
- __m128i val;
- __asm__ volatile (
- "mov.d %[vlo], %[alo] \n\t"
- "mov.d %[vhi], %[blo] \n\t"
- : [vhi]"=&f"(val.hi), [vlo]"=&f"(val.lo)
- : [alo]"f"(__A.lo), [blo]"f"(__B.lo)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_add_epi8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v16qu)__A + (__v16qu)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v8qu)__A.hi + (__v8qu)__B.hi);
- val.lo = (__m64i) ((__v8qu)__A.lo + (__v8qu)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_add_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v8hu)__A + (__v8hu)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v4hu)__A.hi + (__v4hu)__B.hi);
- val.lo = (__m64i) ((__v4hu)__A.lo + (__v4hu)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_add_epi32 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v4su)__A + (__v4su)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v2su)__A.hi + (__v2su)__B.hi);
- val.lo = (__m64i) ((__v2su)__A.lo + (__v2su)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_add_epi64 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v2du)__A + (__v2du)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v1du)__A.hi + (__v1du)__B.hi);
- val.lo = (__m64i) ((__v1du)__A.lo + (__v1du)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_adds_epi8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_paddsb ((int8x8_t)__A.hi, (int8x8_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_paddsb ((int8x8_t)__A.lo, (int8x8_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_adds_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_paddsh ((int16x4_t)__A.hi, (int16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_paddsh ((int16x4_t)__A.lo, (int16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_adds_epu8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_paddusb ((uint8x8_t)__A.hi, (uint8x8_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_paddusb ((uint8x8_t)__A.lo, (uint8x8_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_adds_epu16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_paddush ((uint16x4_t)__A.hi, (uint16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_paddush ((uint16x4_t)__A.lo, (uint16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sub_epi8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v16qu)__A - (__v16qu)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v8qu)__A.hi - (__v8qu)__B.hi);
- val.lo = (__m64i) ((__v8qu)__A.lo - (__v8qu)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sub_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v8hu)__A - (__v8hu)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v4hu)__A.hi - (__v4hu)__B.hi);
- val.lo = (__m64i) ((__v4hu)__A.lo - (__v4hu)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sub_epi32 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v4su)__A - (__v4su)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v2su)__A.hi - (__v2su)__B.hi);
- val.lo = (__m64i) ((__v2su)__A.lo - (__v2su)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sub_epi64 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v2du)__A - (__v2du)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v1du)__A.hi - (__v1du)__B.hi);
- val.lo = (__m64i) ((__v1du)__A.lo - (__v1du)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_subs_epi8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psubsb ((int8x8_t)__A.hi, (int8x8_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_psubsb ((int8x8_t)__A.lo, (int8x8_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_subs_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psubsh ((int16x4_t)__A.hi, (int16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_psubsh ((int16x4_t)__A.lo, (int16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_subs_epu8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psubusb ((uint8x8_t)__A.hi, (uint8x8_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_psubusb ((uint8x8_t)__A.lo, (uint8x8_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_subs_epu16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psubush ((uint16x4_t)__A.hi, (uint16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_psubush ((uint16x4_t)__A.lo, (uint16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_madd_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_pmaddhw ((int16x4_t)__A.hi, (int16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_pmaddhw ((int16x4_t)__A.lo, (int16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mulhi_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_pmulhh ((int16x4_t)__A.hi, (int16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_pmulhh ((int16x4_t)__A.lo, (int16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mullo_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v8hu)__A * (__v8hu)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v4hu)__A.hi * (__v4hu)__B.hi);
- val.lo = (__m64i) ((__v4hu)__A.lo * (__v4hu)__B.lo);
- return val;
-}
-
-extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mul_su32 (__m64 __A, __m64 __B)
-{
- /* return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B); */
- __m64 val;
- val = (__m64) __builtin_loongson_pmuluw ((uint32x2_t)__A, (uint32x2_t)__B);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mul_epu32 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_pmuluw ((uint32x2_t)__A.hi, (uint32x2_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_pmuluw ((uint32x2_t)__A.lo, (uint32x2_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_slli_epi16 (__m128i __A, int __B)
-{
- /* return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psllh_s ((int16x4_t)__A.hi, (uint8_t)__B);
- val.lo = (__m64i) __builtin_loongson_psllh_s ((int16x4_t)__A.lo, (uint8_t)__B);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_slli_epi32 (__m128i __A, int __B)
-{
- /* return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psllw_s ((int32x2_t)__A.hi, (uint8_t)__B);
- val.lo = (__m64i) __builtin_loongson_psllw_s ((int32x2_t)__A.lo, (uint8_t)__B);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_slli_epi64 (__m128i __A, int __B)
-{
- /* return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B); */
- __m128i val;
- __asm__ volatile (
- "dsll %[lo], %[lo], %[__B] \n\t"
- "dsll %[hi], %[hi], %[__B] \n\t"
- : [hi]"=&f"(__A.hi), [lo]"=&f"(__A.lo)
- : [__B]"f"(__B)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srai_epi16 (__m128i __A, int __B)
-{
- /* return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psrah_s ((int16x4_t)__A.hi, (uint8_t)__B);
- val.lo = (__m64i) __builtin_loongson_psrah_s ((int16x4_t)__A.lo, (uint8_t)__B);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srai_epi32 (__m128i __A, int __B)
-{
- /* return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psraw_s ((int32x2_t)__A.hi, (uint8_t)__B);
- val.lo = (__m64i) __builtin_loongson_psraw_s ((int32x2_t)__A.lo, (uint8_t)__B);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_bsrli_si128 (__m128i __A, const int __N)
-{
- /* return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psrlw_s ((int32x2_t)__A.hi, (uint8_t)(__N<<3));
- val.lo = (__m64i) __builtin_loongson_psrlw_s ((int32x2_t)__A.lo, (uint8_t)(__N<<3));
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_bslli_si128 (__m128i __A, const int __N)
-{
- /* return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psllw_s ((int32x2_t)__A.hi, (uint8_t)(__N<<3));
- val.lo = (__m64i) __builtin_loongson_psllw_s ((int32x2_t)__A.lo, (uint8_t)(__N<<3));
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srli_si128 (__m128i __A, const int __N)
-{
- /* return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psrlw_s ((int32x2_t)__A.hi, (uint8_t)(__N<<3));
- val.lo = (__m64i) __builtin_loongson_psrlw_s ((int32x2_t)__A.lo, (uint8_t)(__N<<3));
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_slli_si128 (__m128i __A, const int __N)
-{
- /* return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psllw_s ((int32x2_t)__A.hi, (uint8_t)(__N<<3));
- val.lo = (__m64i) __builtin_loongson_psllw_s ((int32x2_t)__A.lo, (uint8_t)(__N<<3));
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srli_epi16 (__m128i __A, int __B)
-{
- /* return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psrlh_s ((int16x4_t)__A.hi, (uint8_t)__B);
- val.lo = (__m64i) __builtin_loongson_psrlh_s ((int16x4_t)__A.lo, (uint8_t)__B);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srli_epi32 (__m128i __A, int __B)
-{
- /* return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psrlw_s ((int32x2_t)__A.hi, (uint8_t)__B);
- val.lo = (__m64i) __builtin_loongson_psrlw_s ((int32x2_t)__A.lo, (uint8_t)__B);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srli_epi64 (__m128i __A, int __B)
-{
- /* return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B); */
- __m128i val;
- __asm__ volatile (
- "dsrl %[vlo], %[lo], %[__B] \n\t"
- "dsrl %[vhi], %[hi], %[__B] \n\t"
- : [vhi]"=&f"(val.hi), [vlo]"=&f"(val.lo)
- : [hi]"f"(__A.hi), [lo]"f"(__A.lo),
- [__B]"f"(__B)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sll_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- __asm__ volatile (
- "psllh %[vlo], %[lo], %[__B] \n\t"
- "psllh %[vhi], %[hi], %[__B] \n\t"
- : [vhi]"=&f"(val.hi), [vlo]"=&f"(val.lo)
- : [hi]"f"(__A.hi), [lo]"f"(__A.lo),
- [__B]"f"(__B.lo)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sll_epi32 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B); */
- __m128i val;
- __asm__ volatile (
- "psllw %[vlo], %[lo], %[__B] \n\t"
- "psllw %[vhi], %[hi], %[__B] \n\t"
- : [vhi]"=&f"(val.hi), [vlo]"=&f"(val.lo)
- : [hi]"f"(__A.hi), [lo]"f"(__A.lo),
- [__B]"f"(__B.lo)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sll_epi64 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B); */
- __m128i val;
- __asm__ volatile (
- "dsll %[vlo], %[lo], %[__B] \n\t"
- "dsll %[vhi], %[hi], %[__B] \n\t"
- : [vhi]"=&f"(val.hi), [vlo]"=&f"(val.lo)
- : [hi]"f"(__A.hi), [lo]"f"(__A.lo),
- [__B]"f"(__B.lo)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sra_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- __asm__ volatile (
- "psrah %[vlo], %[lo], %[__B] \n\t"
- "psrah %[vhi], %[hi], %[__B] \n\t"
- : [vhi]"=&f"(val.hi), [vlo]"=&f"(val.lo)
- : [hi]"f"(__A.hi), [lo]"f"(__A.lo),
- [__B]"f"(__B.lo)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sra_epi32 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B); */
- __m128i val;
- __asm__ volatile (
- "dsra %[vlo], %[lo], %[__B] \n\t"
- "dsra %[vhi], %[hi], %[__B] \n\t"
- : [vhi]"=&f"(val.hi), [vlo]"=&f"(val.lo)
- : [hi]"f"(__A.hi), [lo]"f"(__A.lo),
- [__B]"f"(__B.lo)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srl_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- __asm__ volatile (
- "psrlh %[vlo], %[lo], %[__B] \n\t"
- "psrlh %[vhi], %[hi], %[__B] \n\t"
- : [vhi]"=&f"(val.hi), [vlo]"=&f"(val.lo)
- : [hi]"f"(__A.hi), [lo]"f"(__A.lo),
- [__B]"f"(__B.lo)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srl_epi32 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B); */
- __m128i val;
- __asm__ volatile (
- "psrlw %[vlo], %[lo], %[__B] \n\t"
- "psrlw %[vhi], %[hi], %[__B] \n\t"
- : [vhi]"=&f"(val.hi), [vlo]"=&f"(val.lo)
- : [hi]"f"(__A.hi), [lo]"f"(__A.lo),
- [__B]"f"(__B.lo)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srl_epi64 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B); */
- __m128i val;
- __asm__ volatile (
- "dsrl %[lo], %[lo], %[__B] \n\t"
- "dsrl %[hi], %[hi], %[__B] \n\t"
- : [hi]"=&f"(__A.hi), [lo]"=&f"(__A.lo)
- : [__B]"f"(__B.lo)
- );
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_and_si128 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v2du)__A & (__v2du)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v1du)__A.hi & (__v1du)__B.hi);
- val.lo = (__m64i) ((__v1du)__A.lo & (__v1du)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_andnot_si128 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B); */
- __m128i val;
- val.hi = (__m64i)__builtin_loongson_pandn_sd ((int64_t)__A.hi, (int64_t)__B.hi);
- val.lo = (__m64i)__builtin_loongson_pandn_sd ((int64_t)__A.lo, (int64_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_or_si128 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v2du)__A | (__v2du)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v1du)__A.hi | (__v1du)__B.hi);
- val.lo = (__m64i) ((__v1du)__A.lo | (__v1du)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_xor_si128 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v2du)__A ^ (__v2du)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v1du)__A.hi ^ (__v1du)__B.hi);
- val.lo = (__m64i) ((__v1du)__A.lo ^ (__v1du)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v16qi)__A == (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v8qi)__A.hi == (__v8qi)__B.hi);
- val.lo = (__m64i) ((__v8qi)__A.lo == (__v8qi)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v8hi)__A == (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v4hi)__A.hi == (__v4hi)__B.hi);
- val.lo = (__m64i) ((__v4hi)__A.lo == (__v4hi)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v4si)__A == (__v4si)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v2si)__A.hi == (__v2si)__B.hi);
- val.lo = (__m64i) ((__v2si)__A.lo == (__v2si)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmplt_epi8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v16qi)__A < (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v8qi)__A.hi < (__v8qi)__B.hi);
- val.lo = (__m64i) ((__v8qi)__A.lo < (__v8qi)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmplt_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v8hi)__A < (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v4hi)__A.hi < (__v4hi)__B.hi);
- val.lo = (__m64i) ((__v4hi)__A.lo < (__v4hi)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmplt_epi32 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v4si)__A < (__v4si)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v2si)__A.hi < (__v2si)__B.hi);
- val.lo = (__m64i) ((__v2si)__A.lo < (__v2si)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v16qi)__A > (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v8qi)__A.hi > (__v8qi)__B.hi);
- val.lo = (__m64i) ((__v8qi)__A.lo > (__v8qi)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v8hi)__A > (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v4hi)__A.hi > (__v4hi)__B.hi);
- val.lo = (__m64i) ((__v4hi)__A.lo > (__v4hi)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
-{
- /* return (__m128i) ((__v4si)__A > (__v4si)__B); */
- __m128i val;
- val.hi = (__m64i) ((__v2si)__A.hi > (__v2si)__B.hi);
- val.lo = (__m64i) ((__v2si)__A.lo > (__v2si)__B.lo);
- return val;
-}
-
-#if 0 /* FIXME */
-#ifdef __OPTIMIZE__
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_extract_epi16 (__m128i const __A, int const __N)
-{
- return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
-{
- return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
-}
-#else
-#define _mm_extract_epi16(A, N) \
- ((int) (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)(__m128i)(A), (int)(N)))
-#define _mm_insert_epi16(A, D, N) \
- ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A), \
- (int)(D), (int)(N)))
-#endif
-#endif
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_max_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_pmaxsh ((int16x4_t)__A.hi, (int16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_pmaxsh ((int16x4_t)__A.lo, (int16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_max_epu8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_pmaxub ((uint8x8_t)__A.hi, (uint8x8_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_pmaxub ((uint8x8_t)__A.lo, (uint8x8_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_min_epi16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_pminsh ((int16x4_t)__A.hi, (int16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_pminsh ((int16x4_t)__A.lo, (int16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_min_epu8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_pminub ((uint8x8_t)__A.hi, (uint8x8_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_pminub ((uint8x8_t)__A.lo, (uint8x8_t)__B.lo);
- return val;
-}
-
-/* FIXME: return int8x8_t, not int */
-extern __inline int8x8_t __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_movemask_epi8 (__m128i __A)
-{
- /* return __builtin_ia32_pmovmskb128 ((__v16qi)__A); */
- int8x8_t val;
- val = __builtin_loongson_pmovmskb_s ((int8x8_t)__A.hi);
- val = val << 8;
- val |= __builtin_loongson_pmovmskb_s ((int8x8_t)__A.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mulhi_epu16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_pmulhuh ((uint16x4_t)__A.hi, (uint16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_pmulhuh ((uint16x4_t)__A.lo, (uint16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_shufflehi_epi16 (__m128i __A, const int __mask)
-{
- /* return (__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __mask); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_pshufh_s ((int16x4_t)__A.hi, (uint8_t)__mask);
- val.lo = (__m64i) __builtin_loongson_pshufh_s ((int16x4_t)__A.lo, (uint8_t)__mask);
- return val;
-}
-
-#if 0 /* FIXME */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_shufflelo_epi16 (__m128i __A, const int __mask)
-{
- return (__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __mask);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_shuffle_epi32 (__m128i __A, const int __mask)
-{
- return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask);
-}
-
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
-{
- __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
-}
-#endif
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_avg_epu8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_pavgb ((uint8x8_t)__A.hi, (uint8x8_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_pavgb ((uint8x8_t)__A.lo, (uint8x8_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_avg_epu16 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_pavgh ((uint16x4_t)__A.hi, (uint16x4_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_pavgh ((uint16x4_t)__A.lo, (uint16x4_t)__B.lo);
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sad_epu8 (__m128i __A, __m128i __B)
-{
- /* return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B); */
- __m128i val;
- val.hi = (__m64i) __builtin_loongson_psadbh ((uint8x8_t)__A.hi, (uint8x8_t)__B.hi);
- val.lo = (__m64i) __builtin_loongson_psadbh ((uint8x8_t)__A.lo, (uint8x8_t)__B.lo);
- return val;
-}
-
-#if 0 /* FIXME */
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_stream_si32 (int *__A, int __B)
-{
- __builtin_ia32_movnti (__A, __B);
-}
-
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_stream_si128 (__m128i *__A, __m128i __B)
-{
- __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
-}
-
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_stream_pd (double *__A, __m128d __B)
-{
- __builtin_ia32_movntpd (__A, (__v2df)__B);
-}
-
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_clflush (void const *__A)
-{
- __builtin_ia32_clflush (__A);
-}
-
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_lfence (void)
-{
- __builtin_ia32_lfence ();
-}
-
-extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mfence (void)
-{
- __builtin_ia32_mfence ();
-}
-#endif
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi32_si128 (int __A)
-{
- return _mm_set_epi32 (0, 0, 0, __A);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi64_si128 (long long __A)
-{
- return _mm_set_epi64x (0, __A);
-}
-
-/* Microsoft intrinsic. */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi64x_si128 (long long __A)
-{
- return _mm_set_epi64x (0, __A);
-}
-
-/* Casts between various SP, DP, INT vector types. Note that these do no
- conversion of values, they just change the type. */
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_castpd_ps(__m128d __A)
-{
- /* return (__m128) __A; */
- __m128 val;
- val.lo = (__m64) __A.lo;
- val.hi = (__m64) __A.hi;
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_castpd_si128(__m128d __A)
-{
- /* return (__m128i) __A; */
- __m128i val;
- val.lo = (__m64i) __A.lo;
- val.hi = (__m64i) __A.hi;
- return val;
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_castps_pd(__m128 __A)
-{
- /* return (__m128d) __A; */
- __m128d val;
- val.lo = (__m64d) __A.lo;
- val.hi = (__m64d) __A.hi;
- return val;
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_castps_si128(__m128 __A)
-{
- /* return (__m128i) __A; */
- __m128i val;
- val.lo = (__m64i) __A.lo;
- val.hi = (__m64i) __A.hi;
- return val;
-}
-
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_castsi128_ps(__m128i __A)
-{
- /* return (__m128) __A; */
- __m128 val;
- val.lo = (__m64) __A.lo;
- val.hi = (__m64) __A.hi;
- return val;
-}
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_castsi128_pd(__m128i __A)
-{
- /* return (__m128d) __A; */
- __m128d val;
- val.lo = (__m64d) __A.lo;
- val.hi = (__m64d) __A.hi;
- return val;
-}
-
-#ifdef __cplusplus
-}
-#endif
-
+#if !defined(_GCC_LOONGSON_MMIINTRIN_H)
+#warning \
+ loongson.h will be deprecated without further notice at a future date. \
+ Please use loongson-mmiintrin.h instead.
+#include "loongson-mmiintrin.h"
#endif
diff --git a/gcc/config/mips/loongson.md b/gcc/config/mips/loongson.md
deleted file mode 100644
index 88f1487..0000000
--- a/gcc/config/mips/loongson.md
+++ /dev/null
@@ -1,974 +0,0 @@
-;; Machine description for Loongson-specific patterns, such as
-;; ST Microelectronics Loongson-2E/2F etc.
-;; Copyright (C) 2008-2018 Free Software Foundation, Inc.
-;; Contributed by CodeSourcery.
-;;
-;; This file is part of GCC.
-;;
-;; GCC is free software; you can redistribute it and/or modify
-;; it under the terms of the GNU General Public License as published by
-;; the Free Software Foundation; either version 3, or (at your option)
-;; any later version.
-
-;; GCC is distributed in the hope that it will be useful,
-;; but WITHOUT ANY WARRANTY; without even the implied warranty of
-;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;; GNU General Public License for more details.
-
-;; You should have received a copy of the GNU General Public License
-;; along with GCC; see the file COPYING3. If not see
-;; <http://www.gnu.org/licenses/>.
-
-(define_c_enum "unspec" [
- UNSPEC_LOONGSON_PAVG
- UNSPEC_LOONGSON_PCMPEQ
- UNSPEC_LOONGSON_PCMPGT
- UNSPEC_LOONGSON_PEXTR
- UNSPEC_LOONGSON_PINSRH
- UNSPEC_LOONGSON_VINIT
- UNSPEC_LOONGSON_PMADD
- UNSPEC_LOONGSON_PMOVMSK
- UNSPEC_LOONGSON_PMULHU
- UNSPEC_LOONGSON_PMULH
- UNSPEC_LOONGSON_PMULU
- UNSPEC_LOONGSON_PASUBUB
- UNSPEC_LOONGSON_BIADD
- UNSPEC_LOONGSON_PSADBH
- UNSPEC_LOONGSON_PSHUFH
- UNSPEC_LOONGSON_PUNPCKH
- UNSPEC_LOONGSON_PUNPCKL
- UNSPEC_LOONGSON_PADDD
- UNSPEC_LOONGSON_PSUBD
- UNSPEC_LOONGSON_DSLL
- UNSPEC_LOONGSON_DSRL
-])
-
-;; Mode iterators and attributes.
-
-;; 64-bit vectors of bytes.
-(define_mode_iterator VB [V8QI])
-
-;; 64-bit vectors of halfwords.
-(define_mode_iterator VH [V4HI])
-
-;; 64-bit vectors of words.
-(define_mode_iterator VW [V2SI])
-
-;; 64-bit vectors of halfwords and bytes.
-(define_mode_iterator VHB [V4HI V8QI])
-
-;; 64-bit vectors of words and halfwords.
-(define_mode_iterator VWH [V2SI V4HI])
-
-;; 64-bit vectors of words and bytes
-(define_mode_iterator VWB [V2SI V8QI])
-
-;; 64-bit vectors of words, halfwords and bytes.
-(define_mode_iterator VWHB [V2SI V4HI V8QI])
-
-;; 64-bit vectors of words, halfwords and bytes; and DImode.
-(define_mode_iterator VWHBDI [V2SI V4HI V8QI DI])
-
-;; The Loongson instruction suffixes corresponding to the modes in the
-;; VWHBDI iterator.
-(define_mode_attr V_suffix [(V2SI "w") (V4HI "h") (V8QI "b") (DI "d")])
-
-;; Given a vector type T, the mode of a vector half the size of T
-;; and with the same number of elements.
-(define_mode_attr V_squash [(V2SI "V2HI") (V4HI "V4QI")])
-
-;; Given a vector type T, the mode of a vector the same size as T
-;; but with half as many elements.
-(define_mode_attr V_stretch_half [(V2SI "DI") (V4HI "V2SI") (V8QI "V4HI")])
-
-;; The Loongson instruction suffixes corresponding to the transformation
-;; expressed by V_stretch_half.
-(define_mode_attr V_stretch_half_suffix [(V2SI "wd") (V4HI "hw") (V8QI "bh")])
-
-;; Given a vector type T, the mode of a vector the same size as T
-;; but with twice as many elements.
-(define_mode_attr V_squash_double [(V2SI "V4HI") (V4HI "V8QI")])
-
-;; Given a vector type T, the inner mode.
-(define_mode_attr V_inner [(V8QI "QI") (V4HI "HI") (V2SI "SI")])
-
-;; The Loongson instruction suffixes corresponding to the conversions
-;; specified by V_half_width.
-(define_mode_attr V_squash_double_suffix [(V2SI "wh") (V4HI "hb")])
-
-;; Move patterns.
-
-;; Expander to legitimize moves involving values of vector modes.
-(define_expand "mov<mode>"
- [(set (match_operand:VWHB 0)
- (match_operand:VWHB 1))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- if (mips_legitimize_move (<MODE>mode, operands[0], operands[1]))
- DONE;
-})
-
-;; Handle legitimized moves between values of vector modes.
-(define_insn "mov<mode>_internal"
- [(set (match_operand:VWHB 0 "nonimmediate_operand" "=m,f,d,f, d, m, d")
- (match_operand:VWHB 1 "move_operand" "f,m,f,dYG,dYG,dYG,m"))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- { return mips_output_move (operands[0], operands[1]); }
- [(set_attr "move_type" "fpstore,fpload,mfc,mtc,move,store,load")
- (set_attr "mode" "DI")])
-
-;; Initialization of a vector.
-
-(define_expand "vec_init<mode><unitmode>"
- [(set (match_operand:VWHB 0 "register_operand")
- (match_operand 1 ""))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- mips_expand_vector_init (operands[0], operands[1]);
- DONE;
-})
-
-;; Helper for vec_init. Initialize element 0 of the output from the input.
-;; All other elements are undefined.
-(define_insn "loongson_vec_init1_<mode>"
- [(set (match_operand:VHB 0 "register_operand" "=f")
- (unspec:VHB [(truncate:<V_inner>
- (match_operand:DI 1 "reg_or_0_operand" "Jd"))]
- UNSPEC_LOONGSON_VINIT))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "dmtc1\t%z1,%0"
- [(set_attr "move_type" "mtc")
- (set_attr "mode" "DI")])
-
-;; Helper for vec_initv2si.
-(define_insn "*vec_concatv2si"
- [(set (match_operand:V2SI 0 "register_operand" "=f")
- (vec_concat:V2SI
- (match_operand:SI 1 "register_operand" "f")
- (match_operand:SI 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpcklwd\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-;; Instruction patterns for SIMD instructions.
-
-;; Pack with signed saturation.
-(define_insn "vec_pack_ssat_<mode>"
- [(set (match_operand:<V_squash_double> 0 "register_operand" "=f")
- (vec_concat:<V_squash_double>
- (ss_truncate:<V_squash>
- (match_operand:VWH 1 "register_operand" "f"))
- (ss_truncate:<V_squash>
- (match_operand:VWH 2 "register_operand" "f"))))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "packss<V_squash_double_suffix>\t%0,%1,%2"
- [(set_attr "type" "fmul")])
-
-;; Pack with unsigned saturation.
-(define_insn "vec_pack_usat_<mode>"
- [(set (match_operand:<V_squash_double> 0 "register_operand" "=f")
- (vec_concat:<V_squash_double>
- (us_truncate:<V_squash>
- (match_operand:VH 1 "register_operand" "f"))
- (us_truncate:<V_squash>
- (match_operand:VH 2 "register_operand" "f"))))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "packus<V_squash_double_suffix>\t%0,%1,%2"
- [(set_attr "type" "fmul")])
-
-;; Addition, treating overflow by wraparound.
-(define_insn "add<mode>3"
- [(set (match_operand:VWHB 0 "register_operand" "=f")
- (plus:VWHB (match_operand:VWHB 1 "register_operand" "f")
- (match_operand:VWHB 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "padd<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Addition of doubleword integers stored in FP registers.
-;; Overflow is treated by wraparound.
-;; We use 'unspec' instead of 'plus' here to avoid clash with
-;; mips.md::add<mode>3. If 'plus' was used, then such instruction
-;; would be recognized as adddi3 and reload would make it use
-;; GPRs instead of FPRs.
-(define_insn "loongson_paddd"
- [(set (match_operand:DI 0 "register_operand" "=f")
- (unspec:DI [(match_operand:DI 1 "register_operand" "f")
- (match_operand:DI 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PADDD))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "paddd\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Addition, treating overflow by signed saturation.
-(define_insn "ssadd<mode>3"
- [(set (match_operand:VHB 0 "register_operand" "=f")
- (ss_plus:VHB (match_operand:VHB 1 "register_operand" "f")
- (match_operand:VHB 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "padds<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Addition, treating overflow by unsigned saturation.
-(define_insn "usadd<mode>3"
- [(set (match_operand:VHB 0 "register_operand" "=f")
- (us_plus:VHB (match_operand:VHB 1 "register_operand" "f")
- (match_operand:VHB 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "paddus<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Logical AND NOT.
-(define_insn "loongson_pandn_<V_suffix>"
- [(set (match_operand:VWHBDI 0 "register_operand" "=f")
- (and:VWHBDI
- (not:VWHBDI (match_operand:VWHBDI 1 "register_operand" "f"))
- (match_operand:VWHBDI 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pandn\t%0,%1,%2"
- [(set_attr "type" "fmul")])
-
-;; Logical AND.
-(define_insn "and<mode>3"
- [(set (match_operand:VWHB 0 "register_operand" "=f")
- (and:VWHB (match_operand:VWHB 1 "register_operand" "f")
- (match_operand:VWHB 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "and\t%0,%1,%2"
- [(set_attr "type" "fmul")])
-
-;; Logical OR.
-(define_insn "ior<mode>3"
- [(set (match_operand:VWHB 0 "register_operand" "=f")
- (ior:VWHB (match_operand:VWHB 1 "register_operand" "f")
- (match_operand:VWHB 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "or\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-;; Logical XOR.
-(define_insn "xor<mode>3"
- [(set (match_operand:VWHB 0 "register_operand" "=f")
- (xor:VWHB (match_operand:VWHB 1 "register_operand" "f")
- (match_operand:VWHB 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "xor\t%0,%1,%2"
- [(set_attr "type" "fmul")])
-
-;; Logical NOR.
-(define_insn "*loongson_nor"
- [(set (match_operand:VWHB 0 "register_operand" "=f")
- (and:VWHB
- (not:VWHB (match_operand:VWHB 1 "register_operand" "f"))
- (not:VWHB (match_operand:VWHB 2 "register_operand" "f"))))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "nor\t%0,%1,%2"
- [(set_attr "type" "fmul")])
-
-;; Logical NOT.
-(define_insn "one_cmpl<mode>2"
- [(set (match_operand:VWHB 0 "register_operand" "=f")
- (not:VWHB (match_operand:VWHB 1 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "nor\t%0,%1,%1"
- [(set_attr "type" "fmul")])
-
-;; Average.
-(define_insn "loongson_pavg<V_suffix>"
- [(set (match_operand:VHB 0 "register_operand" "=f")
- (unspec:VHB [(match_operand:VHB 1 "register_operand" "f")
- (match_operand:VHB 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PAVG))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pavg<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Equality test.
-(define_insn "loongson_pcmpeq<V_suffix>"
- [(set (match_operand:VWHB 0 "register_operand" "=f")
- (unspec:VWHB [(match_operand:VWHB 1 "register_operand" "f")
- (match_operand:VWHB 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PCMPEQ))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pcmpeq<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Greater-than test.
-(define_insn "loongson_pcmpgt<V_suffix>"
- [(set (match_operand:VWHB 0 "register_operand" "=f")
- (unspec:VWHB [(match_operand:VWHB 1 "register_operand" "f")
- (match_operand:VWHB 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PCMPGT))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pcmpgt<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Extract halfword.
-(define_insn "loongson_pextrh"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
- (match_operand:SI 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PEXTR))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pextrh\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-;; Insert halfword.
-(define_insn "loongson_pinsrh_0"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (vec_select:V4HI
- (vec_concat:V8HI
- (match_operand:V4HI 1 "register_operand" "f")
- (match_operand:V4HI 2 "register_operand" "f"))
- (parallel [(const_int 4) (const_int 1)
- (const_int 2) (const_int 3)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pinsrh_0\t%0,%1,%2"
- [(set_attr "type" "fdiv")])
-
-(define_insn "loongson_pinsrh_1"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (vec_select:V4HI
- (vec_concat:V8HI
- (match_operand:V4HI 1 "register_operand" "f")
- (match_operand:V4HI 2 "register_operand" "f"))
- (parallel [(const_int 0) (const_int 4)
- (const_int 2) (const_int 3)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pinsrh_1\t%0,%1,%2"
- [(set_attr "type" "fdiv")])
-
-(define_insn "loongson_pinsrh_2"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (vec_select:V4HI
- (vec_concat:V8HI
- (match_operand:V4HI 1 "register_operand" "f")
- (match_operand:V4HI 2 "register_operand" "f"))
- (parallel [(const_int 0) (const_int 1)
- (const_int 4) (const_int 3)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pinsrh_2\t%0,%1,%2"
- [(set_attr "type" "fdiv")])
-
-(define_insn "loongson_pinsrh_3"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (vec_select:V4HI
- (vec_concat:V8HI
- (match_operand:V4HI 1 "register_operand" "f")
- (match_operand:V4HI 2 "register_operand" "f"))
- (parallel [(const_int 0) (const_int 1)
- (const_int 2) (const_int 4)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pinsrh_3\t%0,%1,%2"
- [(set_attr "type" "fdiv")])
-
-(define_insn "*vec_setv4hi"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
- (match_operand:SI 2 "register_operand" "f")
- (match_operand:SI 3 "const_0_to_3_operand" "")]
- UNSPEC_LOONGSON_PINSRH))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pinsrh_%3\t%0,%1,%2"
- [(set_attr "type" "fdiv")])
-
-(define_expand "vec_setv4hi"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
- (match_operand:HI 2 "register_operand" "f")
- (match_operand:SI 3 "const_0_to_3_operand" "")]
- UNSPEC_LOONGSON_PINSRH))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- rtx ext = gen_reg_rtx (SImode);
- emit_move_insn (ext, gen_lowpart (SImode, operands[1]));
- operands[1] = ext;
-})
-
-;; Multiply and add packed integers.
-(define_insn "loongson_pmaddhw"
- [(set (match_operand:V2SI 0 "register_operand" "=f")
- (unspec:V2SI [(match_operand:V4HI 1 "register_operand" "f")
- (match_operand:V4HI 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PMADD))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pmaddhw\t%0,%1,%2"
- [(set_attr "type" "fmul")])
-
-(define_expand "sdot_prodv4hi"
- [(match_operand:V2SI 0 "register_operand" "")
- (match_operand:V4HI 1 "register_operand" "")
- (match_operand:V4HI 2 "register_operand" "")
- (match_operand:V2SI 3 "register_operand" "")]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- rtx t = gen_reg_rtx (V2SImode);
- emit_insn (gen_loongson_pmaddhw (t, operands[1], operands[2]));
- emit_insn (gen_addv2si3 (operands[0], t, operands[3]));
- DONE;
-})
-
-;; Maximum of signed halfwords.
-(define_insn "smaxv4hi3"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (smax:V4HI (match_operand:V4HI 1 "register_operand" "f")
- (match_operand:V4HI 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pmaxsh\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-(define_expand "smax<mode>3"
- [(match_operand:VWB 0 "register_operand" "")
- (match_operand:VWB 1 "register_operand" "")
- (match_operand:VWB 2 "register_operand" "")]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- mips_expand_vec_minmax (operands[0], operands[1], operands[2],
- gen_loongson_pcmpgt<V_suffix>, false);
- DONE;
-})
-
-;; Maximum of unsigned bytes.
-(define_insn "umaxv8qi3"
- [(set (match_operand:V8QI 0 "register_operand" "=f")
- (umax:V8QI (match_operand:V8QI 1 "register_operand" "f")
- (match_operand:V8QI 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pmaxub\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Minimum of signed halfwords.
-(define_insn "sminv4hi3"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (smin:V4HI (match_operand:V4HI 1 "register_operand" "f")
- (match_operand:V4HI 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pminsh\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-(define_expand "smin<mode>3"
- [(match_operand:VWB 0 "register_operand" "")
- (match_operand:VWB 1 "register_operand" "")
- (match_operand:VWB 2 "register_operand" "")]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- mips_expand_vec_minmax (operands[0], operands[1], operands[2],
- gen_loongson_pcmpgt<V_suffix>, true);
- DONE;
-})
-
-;; Minimum of unsigned bytes.
-(define_insn "uminv8qi3"
- [(set (match_operand:V8QI 0 "register_operand" "=f")
- (umin:V8QI (match_operand:V8QI 1 "register_operand" "f")
- (match_operand:V8QI 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pminub\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Move byte mask.
-(define_insn "loongson_pmovmsk<V_suffix>"
- [(set (match_operand:VB 0 "register_operand" "=f")
- (unspec:VB [(match_operand:VB 1 "register_operand" "f")]
- UNSPEC_LOONGSON_PMOVMSK))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pmovmsk<V_suffix>\t%0,%1"
- [(set_attr "type" "fabs")])
-
-;; Multiply unsigned integers and store high result.
-(define_insn "umul<mode>3_highpart"
- [(set (match_operand:VH 0 "register_operand" "=f")
- (unspec:VH [(match_operand:VH 1 "register_operand" "f")
- (match_operand:VH 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PMULHU))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pmulhu<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fmul")])
-
-;; Multiply signed integers and store high result.
-(define_insn "smul<mode>3_highpart"
- [(set (match_operand:VH 0 "register_operand" "=f")
- (unspec:VH [(match_operand:VH 1 "register_operand" "f")
- (match_operand:VH 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PMULH))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pmulh<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fmul")])
-
-;; Multiply signed integers and store low result.
-(define_insn "mul<mode>3"
- [(set (match_operand:VH 0 "register_operand" "=f")
- (mult:VH (match_operand:VH 1 "register_operand" "f")
- (match_operand:VH 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pmull<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fmul")])
-
-;; Multiply unsigned word integers.
-(define_insn "loongson_pmulu<V_suffix>"
- [(set (match_operand:DI 0 "register_operand" "=f")
- (unspec:DI [(match_operand:VW 1 "register_operand" "f")
- (match_operand:VW 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PMULU))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pmulu<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fmul")])
-
-;; Absolute difference.
-(define_insn "loongson_pasubub"
- [(set (match_operand:VB 0 "register_operand" "=f")
- (unspec:VB [(match_operand:VB 1 "register_operand" "f")
- (match_operand:VB 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PASUBUB))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pasubub\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Sum of unsigned byte integers.
-(define_insn "loongson_biadd"
- [(set (match_operand:<V_stretch_half> 0 "register_operand" "=f")
- (unspec:<V_stretch_half> [(match_operand:VB 1 "register_operand" "f")]
- UNSPEC_LOONGSON_BIADD))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "biadd\t%0,%1"
- [(set_attr "type" "fabs")])
-
-(define_insn "reduc_uplus_v8qi"
- [(set (match_operand:V8QI 0 "register_operand" "=f")
- (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "f")]
- UNSPEC_LOONGSON_BIADD))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "biadd\t%0,%1"
- [(set_attr "type" "fabs")])
-
-;; Sum of absolute differences.
-(define_insn "loongson_psadbh"
- [(set (match_operand:<V_stretch_half> 0 "register_operand" "=f")
- (unspec:<V_stretch_half> [(match_operand:VB 1 "register_operand" "f")
- (match_operand:VB 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PSADBH))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pasubub\t%0,%1,%2;biadd\t%0,%0"
- [(set_attr "type" "fadd")])
-
-;; Shuffle halfwords.
-(define_insn "loongson_pshufh"
- [(set (match_operand:VH 0 "register_operand" "=f")
- (unspec:VH [(match_operand:VH 1 "register_operand" "f")
- (match_operand:SI 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PSHUFH))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "pshufh\t%0,%1,%2"
- [(set_attr "type" "fmul")])
-
-;; Shift left logical.
-(define_insn "ashl<mode>3"
- [(set (match_operand:VWH 0 "register_operand" "=f")
- (ashift:VWH (match_operand:VWH 1 "register_operand" "f")
- (match_operand:SI 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "psll<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-;; Shift right arithmetic.
-(define_insn "ashr<mode>3"
- [(set (match_operand:VWH 0 "register_operand" "=f")
- (ashiftrt:VWH (match_operand:VWH 1 "register_operand" "f")
- (match_operand:SI 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "psra<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-;; Shift right logical.
-(define_insn "lshr<mode>3"
- [(set (match_operand:VWH 0 "register_operand" "=f")
- (lshiftrt:VWH (match_operand:VWH 1 "register_operand" "f")
- (match_operand:SI 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "psrl<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-;; Subtraction, treating overflow by wraparound.
-(define_insn "sub<mode>3"
- [(set (match_operand:VWHB 0 "register_operand" "=f")
- (minus:VWHB (match_operand:VWHB 1 "register_operand" "f")
- (match_operand:VWHB 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "psub<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Subtraction of doubleword integers stored in FP registers.
-;; Overflow is treated by wraparound.
-;; See loongson_paddd for the reason we use 'unspec' rather than
-;; 'minus' here.
-(define_insn "loongson_psubd"
- [(set (match_operand:DI 0 "register_operand" "=f")
- (unspec:DI [(match_operand:DI 1 "register_operand" "f")
- (match_operand:DI 2 "register_operand" "f")]
- UNSPEC_LOONGSON_PSUBD))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "psubd\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Subtraction, treating overflow by signed saturation.
-(define_insn "sssub<mode>3"
- [(set (match_operand:VHB 0 "register_operand" "=f")
- (ss_minus:VHB (match_operand:VHB 1 "register_operand" "f")
- (match_operand:VHB 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "psubs<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Subtraction, treating overflow by unsigned saturation.
-(define_insn "ussub<mode>3"
- [(set (match_operand:VHB 0 "register_operand" "=f")
- (us_minus:VHB (match_operand:VHB 1 "register_operand" "f")
- (match_operand:VHB 2 "register_operand" "f")))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "psubus<V_suffix>\t%0,%1,%2"
- [(set_attr "type" "fadd")])
-
-;; Unpack high data. Recall that Loongson only runs in little-endian.
-(define_insn "loongson_punpckhbh"
- [(set (match_operand:V8QI 0 "register_operand" "=f")
- (vec_select:V8QI
- (vec_concat:V16QI
- (match_operand:V8QI 1 "register_operand" "f")
- (match_operand:V8QI 2 "register_operand" "f"))
- (parallel [(const_int 4) (const_int 12)
- (const_int 5) (const_int 13)
- (const_int 6) (const_int 14)
- (const_int 7) (const_int 15)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpckhbh\t%0,%1,%2"
- [(set_attr "type" "fdiv")])
-
-(define_insn "loongson_punpckhhw"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (vec_select:V4HI
- (vec_concat:V8HI
- (match_operand:V4HI 1 "register_operand" "f")
- (match_operand:V4HI 2 "register_operand" "f"))
- (parallel [(const_int 2) (const_int 6)
- (const_int 3) (const_int 7)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpckhhw\t%0,%1,%2"
- [(set_attr "type" "fdiv")])
-
-(define_insn "loongson_punpckhhw_qi"
- [(set (match_operand:V8QI 0 "register_operand" "=f")
- (vec_select:V8QI
- (vec_concat:V16QI
- (match_operand:V8QI 1 "register_operand" "f")
- (match_operand:V8QI 2 "register_operand" "f"))
- (parallel [(const_int 4) (const_int 5)
- (const_int 12) (const_int 13)
- (const_int 6) (const_int 7)
- (const_int 14) (const_int 15)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpckhhw\t%0,%1,%2"
- [(set_attr "type" "fdiv")])
-
-(define_insn "loongson_punpckhwd"
- [(set (match_operand:V2SI 0 "register_operand" "=f")
- (vec_select:V2SI
- (vec_concat:V4SI
- (match_operand:V2SI 1 "register_operand" "f")
- (match_operand:V2SI 2 "register_operand" "f"))
- (parallel [(const_int 1) (const_int 3)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpckhwd\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-(define_insn "loongson_punpckhwd_qi"
- [(set (match_operand:V8QI 0 "register_operand" "=f")
- (vec_select:V8QI
- (vec_concat:V16QI
- (match_operand:V8QI 1 "register_operand" "f")
- (match_operand:V8QI 2 "register_operand" "f"))
- (parallel [(const_int 4) (const_int 5)
- (const_int 6) (const_int 7)
- (const_int 12) (const_int 13)
- (const_int 14) (const_int 15)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpckhwd\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-(define_insn "loongson_punpckhwd_hi"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (vec_select:V4HI
- (vec_concat:V8HI
- (match_operand:V4HI 1 "register_operand" "f")
- (match_operand:V4HI 2 "register_operand" "f"))
- (parallel [(const_int 2) (const_int 3)
- (const_int 6) (const_int 7)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpckhwd\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-;; Unpack low data.
-(define_insn "loongson_punpcklbh"
- [(set (match_operand:V8QI 0 "register_operand" "=f")
- (vec_select:V8QI
- (vec_concat:V16QI
- (match_operand:V8QI 1 "register_operand" "f")
- (match_operand:V8QI 2 "register_operand" "f"))
- (parallel [(const_int 0) (const_int 8)
- (const_int 1) (const_int 9)
- (const_int 2) (const_int 10)
- (const_int 3) (const_int 11)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpcklbh\t%0,%1,%2"
- [(set_attr "type" "fdiv")])
-
-(define_insn "loongson_punpcklhw"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (vec_select:V4HI
- (vec_concat:V8HI
- (match_operand:V4HI 1 "register_operand" "f")
- (match_operand:V4HI 2 "register_operand" "f"))
- (parallel [(const_int 0) (const_int 4)
- (const_int 1) (const_int 5)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpcklhw\t%0,%1,%2"
- [(set_attr "type" "fdiv")])
-
-(define_insn "*loongson_punpcklhw_qi"
- [(set (match_operand:V8QI 0 "register_operand" "=f")
- (vec_select:V8QI
- (vec_concat:V16QI
- (match_operand:V8QI 1 "register_operand" "f")
- (match_operand:V8QI 2 "register_operand" "f"))
- (parallel [(const_int 0) (const_int 1)
- (const_int 8) (const_int 9)
- (const_int 2) (const_int 3)
- (const_int 10) (const_int 11)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpcklhw\t%0,%1,%2"
- [(set_attr "type" "fdiv")])
-
-(define_insn "loongson_punpcklwd"
- [(set (match_operand:V2SI 0 "register_operand" "=f")
- (vec_select:V2SI
- (vec_concat:V4SI
- (match_operand:V2SI 1 "register_operand" "f")
- (match_operand:V2SI 2 "register_operand" "f"))
- (parallel [(const_int 0) (const_int 2)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpcklwd\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-(define_insn "*loongson_punpcklwd_qi"
- [(set (match_operand:V8QI 0 "register_operand" "=f")
- (vec_select:V8QI
- (vec_concat:V16QI
- (match_operand:V8QI 1 "register_operand" "f")
- (match_operand:V8QI 2 "register_operand" "f"))
- (parallel [(const_int 0) (const_int 1)
- (const_int 2) (const_int 3)
- (const_int 8) (const_int 9)
- (const_int 10) (const_int 11)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpcklwd\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-(define_insn "*loongson_punpcklwd_hi"
- [(set (match_operand:V4HI 0 "register_operand" "=f")
- (vec_select:V4HI
- (vec_concat:V8HI
- (match_operand:V4HI 1 "register_operand" "f")
- (match_operand:V4HI 2 "register_operand" "f"))
- (parallel [(const_int 0) (const_int 1)
- (const_int 4) (const_int 5)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "punpcklwd\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-(define_expand "vec_unpacks_lo_<mode>"
- [(match_operand:<V_stretch_half> 0 "register_operand" "")
- (match_operand:VHB 1 "register_operand" "")]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- mips_expand_vec_unpack (operands, false, false);
- DONE;
-})
-
-(define_expand "vec_unpacks_hi_<mode>"
- [(match_operand:<V_stretch_half> 0 "register_operand" "")
- (match_operand:VHB 1 "register_operand" "")]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- mips_expand_vec_unpack (operands, false, true);
- DONE;
-})
-
-(define_expand "vec_unpacku_lo_<mode>"
- [(match_operand:<V_stretch_half> 0 "register_operand" "")
- (match_operand:VHB 1 "register_operand" "")]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- mips_expand_vec_unpack (operands, true, false);
- DONE;
-})
-
-(define_expand "vec_unpacku_hi_<mode>"
- [(match_operand:<V_stretch_half> 0 "register_operand" "")
- (match_operand:VHB 1 "register_operand" "")]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- mips_expand_vec_unpack (operands, true, true);
- DONE;
-})
-
-;; Whole vector shifts, used for reduction epilogues.
-(define_insn "vec_shl_<mode>"
- [(set (match_operand:VWHBDI 0 "register_operand" "=f")
- (unspec:VWHBDI [(match_operand:VWHBDI 1 "register_operand" "f")
- (match_operand:SI 2 "register_operand" "f")]
- UNSPEC_LOONGSON_DSLL))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "dsll\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-(define_insn "vec_shr_<mode>"
- [(set (match_operand:VWHBDI 0 "register_operand" "=f")
- (unspec:VWHBDI [(match_operand:VWHBDI 1 "register_operand" "f")
- (match_operand:SI 2 "register_operand" "f")]
- UNSPEC_LOONGSON_DSRL))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "dsrl\t%0,%1,%2"
- [(set_attr "type" "fcvt")])
-
-;; FOR G3A 128-bit mem access instructions
-;; gssq and gslq define_insn pattern
-(define_insn "movsqdi_internal"
- [(parallel [(set (match_operand:DI 0 "stack_operand" "")
- (match_operand:DI 1 "register_operand" ""))
- (set (match_operand:DI 2 "stack_operand" "")
- (match_operand:DI 3 "register_operand" ""))])]
- "mips_gs464_128_store_p(operands)"
- {
- return "gssq\t%1,%3,%2";
- }
- [(set_attr "length" "8")
- (set_attr "can_delay" "no")])
-
-(define_insn "movsqdf_internal"
- [(parallel [(set (match_operand:DF 0 "stack_operand" "")
- (match_operand:DF 1 "register_operand" ""))
- (set (match_operand:DF 2 "stack_operand" "")
- (match_operand:DF 3 "register_operand" ""))])]
- "mips_gs464_128_store_p(operands)"
- {
- return "gssqc1\t%1,%3,%2";
- }
- [(set_attr "length" "8")
- (set_attr "can_delay" "no")])
-
-(define_insn "movlqdi_internal"
- [(parallel [(set (match_operand:DI 0 "register_operand" "")
- (match_operand:DI 1 "stack_operand" ""))
- (set (match_operand:DI 2 "register_operand" "")
- (match_operand:DI 3 "stack_operand" ""))])]
- "mips_gs464_128_load_p(operands)"
- {
- return "gslq\t%0,%2,%3";
- }
- [(set_attr "length" "8")
- (set_attr "can_delay" "no")])
-
-(define_insn "movlqdf_internal"
- [(parallel [(set (match_operand:DF 0 "register_operand" "")
- (match_operand:DF 1 "stack_operand" ""))
- (set (match_operand:DF 2 "register_operand" "")
- (match_operand:DF 3 "stack_operand" ""))])]
- "mips_gs464_128_load_p(operands)"
- {
- return "gslqc1\t%0,%2,%3";
- }
- [(set_attr "length" "8")
- (set_attr "can_delay" "no")])
-
-;;for insn_and_split template sign extend
-(define_insn "gsdmul3di"
- [(set (match_operand:DI 0 "register_operand" "=d")
- (mult:DI (sign_extend:DI (match_operand:SI 1 "register_operand" "d"))
- (sign_extend: DI (match_operand:SI 2 "register_operand" "d"))))]
- "TARGET_LOONGSON_3A && TARGET_64BIT"
- "gsdmult\t%0,%1,%2"
- [(set_attr "type" "imul3nc")
- (set_attr "mode" "SI")])
-
-;;for insn_and_split template zero extend
-(define_insn "ugsdmul3di"
- [(set (match_operand:DI 0 "register_operand" "=d")
- (mult:DI (zero_extend:DI (match_operand:SI 1 "register_operand" "d"))
- (zero_extend: DI (match_operand:SI 2 "register_operand" "d"))))]
- "TARGET_LOONGSON_3A && TARGET_64BIT"
- "gsdmultu\t%0,%1,%2"
- [(set_attr "type" "imul3nc")
- (set_attr "mode" "SI")])
-
-(define_insn "vec_loongson_extract_lo_<mode>"
- [(set (match_operand:<V_inner> 0 "register_operand" "=r")
- (vec_select:<V_inner>
- (match_operand:VWHB 1 "register_operand" "f")
- (parallel [(const_int 0)])))]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
- "mfc1\t%0,%1"
- [(set_attr "type" "mfc")])
-
-(define_expand "reduc_plus_scal_<mode>"
- [(match_operand:<V_inner> 0 "register_operand" "")
- (match_operand:VWHB 1 "register_operand" "")]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
- mips_expand_vec_reduc (tmp, operands[1], gen_add<mode>3);
- emit_insn (gen_vec_loongson_extract_lo_<mode> (operands[0], tmp));
- DONE;
-})
-
-(define_expand "reduc_smax_scal_<mode>"
- [(match_operand:<V_inner> 0 "register_operand" "")
- (match_operand:VWHB 1 "register_operand" "")]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
- mips_expand_vec_reduc (tmp, operands[1], gen_smax<mode>3);
- emit_insn (gen_vec_loongson_extract_lo_<mode> (operands[0], tmp));
- DONE;
-})
-
-(define_expand "reduc_smin_scal_<mode>"
- [(match_operand:<V_inner> 0 "register_operand" "")
- (match_operand:VWHB 1 "register_operand" "")]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
- mips_expand_vec_reduc (tmp, operands[1], gen_smin<mode>3);
- emit_insn (gen_vec_loongson_extract_lo_<mode> (operands[0], tmp));
- DONE;
-})
-
-(define_expand "reduc_umax_scal_<mode>"
- [(match_operand:<V_inner> 0 "register_operand" "")
- (match_operand:VB 1 "register_operand" "")]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
- mips_expand_vec_reduc (tmp, operands[1], gen_umax<mode>3);
- emit_insn (gen_vec_loongson_extract_lo_<mode> (operands[0], tmp));
- DONE;
-})
-
-(define_expand "reduc_umin_scal_<mode>"
- [(match_operand:<V_inner> 0 "register_operand" "")
- (match_operand:VB 1 "register_operand" "")]
- "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
- rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
- mips_expand_vec_reduc (tmp, operands[1], gen_umin<mode>3);
- emit_insn (gen_vec_loongson_extract_lo_<mode> (operands[0], tmp));
- DONE;
-})
diff --git a/gcc/config/mips/loongson3a.md b/gcc/config/mips/loongson3a.md
deleted file mode 100644
index 2ebde68..0000000
--- a/gcc/config/mips/loongson3a.md
+++ /dev/null
@@ -1,137 +0,0 @@
-;; Pipeline model for Loongson-3A cores.
-
-;; Copyright (C) 2011-2018 Free Software Foundation, Inc.
-;;
-;; This file is part of GCC.
-;;
-;; GCC is free software; you can redistribute it and/or modify it
-;; under the terms of the GNU General Public License as published
-;; by the Free Software Foundation; either version 3, or (at your
-;; option) any later version.
-;;
-;; GCC is distributed in the hope that it will be useful, but WITHOUT
-;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
-;; License for more details.
-;;
-;; You should have received a copy of the GNU General Public License
-;; along with GCC; see the file COPYING3. If not see
-;; <http://www.gnu.org/licenses/>.
-
-;; Uncomment the following line to output automata for debugging.
-;; (automata_option "v")
-
-;; Automaton for integer instructions.
-(define_automaton "ls3a_a_alu")
-
-;; Automaton for floating-point instructions.
-(define_automaton "ls3a_a_falu")
-
-;; Automaton for memory operations.
-(define_automaton "ls3a_a_mem")
-
-;; Describe the resources.
-
-(define_cpu_unit "ls3a_alu1" "ls3a_a_alu")
-(define_cpu_unit "ls3a_alu2" "ls3a_a_alu")
-(define_cpu_unit "ls3a_mem" "ls3a_a_mem")
-(define_cpu_unit "ls3a_falu1" "ls3a_a_falu")
-(define_cpu_unit "ls3a_falu2" "ls3a_a_falu")
-
-;; Describe instruction reservations.
-
-(define_insn_reservation "ls3a_arith" 1
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "arith,clz,const,logical,
- move,nop,shift,signext,slt"))
- "ls3a_alu1 | ls3a_alu2")
-
-(define_insn_reservation "ls3a_branch" 1
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "branch,jump,call,condmove,trap"))
- "ls3a_alu1")
-
-(define_insn_reservation "ls3a_mfhilo" 1
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "mfhi,mflo,mthi,mtlo"))
- "ls3a_alu2")
-
-;; Operation imul3nc is fully pipelined.
-(define_insn_reservation "ls3a_imul3nc" 5
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "imul3nc"))
- "ls3a_alu2")
-
-(define_insn_reservation "ls3a_imul" 7
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "imul,imadd"))
- "ls3a_alu2 * 7")
-
-(define_insn_reservation "ls3a_idiv_si" 12
- (and (eq_attr "cpu" "loongson_3a")
- (and (eq_attr "type" "idiv")
- (eq_attr "mode" "SI")))
- "ls3a_alu2 * 12")
-
-(define_insn_reservation "ls3a_idiv_di" 25
- (and (eq_attr "cpu" "loongson_3a")
- (and (eq_attr "type" "idiv")
- (eq_attr "mode" "DI")))
- "ls3a_alu2 * 25")
-
-(define_insn_reservation "ls3a_load" 3
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "load"))
- "ls3a_mem")
-
-(define_insn_reservation "ls3a_fpload" 4
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "load,mfc,mtc"))
- "ls3a_mem")
-
-(define_insn_reservation "ls3a_prefetch" 0
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "prefetch,prefetchx"))
- "ls3a_mem")
-
-(define_insn_reservation "ls3a_store" 0
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "store,fpstore,fpidxstore"))
- "ls3a_mem")
-
-;; All the fp operations can be executed in FALU1. Only fp add,
-;; sub, mul, madd can be executed in FALU2. Try FALU2 firstly.
-(define_insn_reservation "ls3a_fadd" 6
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "fadd,fmul,fmadd"))
- "ls3a_falu2 | ls3a_falu1")
-
-(define_insn_reservation "ls3a_fcmp" 2
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "fabs,fcmp,fmove,fneg"))
- "ls3a_falu1")
-
-(define_insn_reservation "ls3a_fcvt" 4
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "fcvt"))
- "ls3a_falu1")
-
-(define_insn_reservation "ls3a_fdiv_sf" 12
- (and (eq_attr "cpu" "loongson_3a")
- (and (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt")
- (eq_attr "mode" "SF")))
- "ls3a_falu1 * 12")
-
-(define_insn_reservation "ls3a_fdiv_df" 19
- (and (eq_attr "cpu" "loongson_3a")
- (and (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt")
- (eq_attr "mode" "DF")))
- "ls3a_falu1 * 19")
-
-;; Force single-dispatch for unknown or multi.
-(define_insn_reservation "ls3a_unknown" 1
- (and (eq_attr "cpu" "loongson_3a")
- (eq_attr "type" "unknown,multi,atomic,syncloop"))
- "ls3a_alu1 + ls3a_alu2 + ls3a_falu1 + ls3a_falu2 + ls3a_mem")
-
-;; End of DFA-based pipeline description for loongson_3a
diff --git a/gcc/config/mips/mips-cpus.def b/gcc/config/mips/mips-cpus.def
index d0640e5..e055117 100644
--- a/gcc/config/mips/mips-cpus.def
+++ b/gcc/config/mips/mips-cpus.def
@@ -162,7 +162,10 @@ MIPS_CPU ("sr71000", PROCESSOR_SR71000, 64, PTF_AVOID_BRANCHLIKELY_SPEED)
MIPS_CPU ("xlr", PROCESSOR_XLR, 64, PTF_AVOID_BRANCHLIKELY_SPEED)
/* MIPS64 Release 2 processors. */
-MIPS_CPU ("loongson3a", PROCESSOR_LOONGSON_3A, 65, PTF_AVOID_BRANCHLIKELY_SPEED)
+MIPS_CPU ("loongson3a", PROCESSOR_GS464, 65, PTF_AVOID_BRANCHLIKELY_SPEED)
+MIPS_CPU ("gs464", PROCESSOR_GS464, 65, PTF_AVOID_BRANCHLIKELY_SPEED)
+MIPS_CPU ("gs464e", PROCESSOR_GS464E, 65, PTF_AVOID_BRANCHLIKELY_SPEED)
+MIPS_CPU ("gs264e", PROCESSOR_GS264E, 65, PTF_AVOID_BRANCHLIKELY_SPEED)
MIPS_CPU ("octeon", PROCESSOR_OCTEON, 65, PTF_AVOID_BRANCHLIKELY_SPEED)
MIPS_CPU ("octeon+", PROCESSOR_OCTEON, 65, PTF_AVOID_BRANCHLIKELY_SPEED)
MIPS_CPU ("octeon2", PROCESSOR_OCTEON2, 65, PTF_AVOID_BRANCHLIKELY_SPEED)
diff --git a/gcc/config/mips/mips-tables.opt b/gcc/config/mips/mips-tables.opt
index daccefb..461881d 100644
--- a/gcc/config/mips/mips-tables.opt
+++ b/gcc/config/mips/mips-tables.opt
@@ -679,20 +679,28 @@ EnumValue
Enum(mips_arch_opt_value) String(loongson3a) Value(96) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(octeon) Value(97) Canonical
+Enum(mips_arch_opt_value) String(gs464) Value(97) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(octeon+) Value(98) Canonical
+Enum(mips_arch_opt_value) String(octeon) Value(98) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(octeon2) Value(99) Canonical
+Enum(mips_arch_opt_value) String(octeon+) Value(99) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(octeon3) Value(100) Canonical
+Enum(mips_arch_opt_value) String(octeon2) Value(100) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(xlp) Value(101) Canonical
+Enum(mips_arch_opt_value) String(octeon3) Value(101) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(i6400) Value(102) Canonical
+Enum(mips_arch_opt_value) String(xlp) Value(102) Canonical
+EnumValue
+Enum(mips_arch_opt_value) String(i6400) Value(103) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(i6500) Value(104) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(p6600) Value(105) Canonical
diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 9daaaaa..9ce7d14 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -826,7 +826,13 @@ static const struct mips_rtx_cost_data
{ /* Loongson-2F */
DEFAULT_COSTS
},
- { /* Loongson-3A */
+ { /* Loongson gs464. */
+ DEFAULT_COSTS
+ },
+ { /* Loongson gs464e. */
+ DEFAULT_COSTS
+ },
+ { /* Loongson gs264e. */
DEFAULT_COSTS
},
{ /* M4k */
@@ -11646,7 +11652,7 @@ mips_for_each_saved_gpr_and_fpr (HOST_WIDE_INT sp_offset,
/* The loongson3a gs464 gss<l>q[c1] instructions offset has 9+4 bit equal to 4096
* Option -mno-gs464-func-save-restore-reg disable this. */
- if(flag_sr_opt && TARGET_LOONGSON_3A
+ if(flag_sr_opt && TARGET_LOONGSON_EXT
&& TARGET_64BIT && !ABI_32 && (offset < 4096))
{/* FIXME: ABI */
for (regno = GP_REG_LAST; regno >= GP_REG_FIRST; regno--)
@@ -11709,7 +11715,7 @@ mips_for_each_saved_gpr_and_fpr (HOST_WIDE_INT sp_offset,
offset = cfun->machine->frame.fp_sp_offset - sp_offset;
fpr_mode = (TARGET_SINGLE_FLOAT ? SFmode : DFmode);
save_regno1 = save_regno2 = 0;
- if(flag_sr_opt && TARGET_LOONGSON_3A && TARGET_FLOAT64
+ if(flag_sr_opt && TARGET_LOONGSON_EXT && TARGET_FLOAT64
&& !ABI_32 && (fpr_mode == DFmode) && (offset < 4096))
{
for (regno = FP_REG_LAST - MAX_FPRS_PER_FMT + 1;
@@ -12921,8 +12927,9 @@ mips_hard_regno_mode_ok_uncached (unsigned int regno, machine_mode mode)
if (mode == CCFmode)
return !(TARGET_FLOATXX && (regno & 1) != 0);
- /* Allow 64-bit vector modes for Loongson-2E/2F. */
- if (TARGET_LOONGSON_VECTORS
+ /* Allow 64-bit vector modes for Loongson MultiMedia extensions
+ Instructions (MMI). */
+ if (TARGET_LOONGSON_MMI
&& (mode == V2SImode
|| mode == V4HImode
|| mode == V8QImode
@@ -13492,7 +13499,7 @@ mips_vector_mode_supported_p (machine_mode mode)
case E_V2SImode:
case E_V4HImode:
case E_V8QImode:
- return TARGET_LOONGSON_VECTORS;
+ return TARGET_LOONGSON_MMI;
default:
return MSA_SUPPORTED_MODE_P (mode);
@@ -14245,7 +14252,7 @@ mips_process_sync_loop (rtx_insn *insn, rtx *operands)
/* Output the release side of the memory barrier. */
/* The loongson3a need sync after label "1:", disable this */
- if (need_atomic_barrier_p (model, true) && ! TARGET_LOONGSON_3A)
+ if (need_atomic_barrier_p (model, true) && ! TARGET_LOONGSON_EXT)
{
if (required_oldval == 0 && TARGET_OCTEON)
{
@@ -14379,7 +14386,7 @@ mips_process_sync_loop (rtx_insn *insn, rtx *operands)
/* Output the acquire side of the memory barrier. */
/* The loongson3a need sync after label "2:", disable this */
- if (TARGET_SYNC_AFTER_SC && need_atomic_barrier_p (model, false) && ! TARGET_LOONGSON_3A)
+ if (TARGET_SYNC_AFTER_SC && need_atomic_barrier_p (model, false) && ! TARGET_LOONGSON_EXT)
mips_multi_add_insn ("sync", NULL);
/* Output the exit label, if needed. */
@@ -14734,6 +14741,7 @@ mips_issue_rate (void)
case PROCESSOR_OCTEON2:
case PROCESSOR_OCTEON3:
case PROCESSOR_I6400:
+ case PROCESSOR_GS264E:
return 2;
case PROCESSOR_SB1:
@@ -14746,7 +14754,8 @@ mips_issue_rate (void)
case PROCESSOR_LOONGSON_2E:
case PROCESSOR_LOONGSON_2F:
- case PROCESSOR_LOONGSON_3A:
+ case PROCESSOR_GS464:
+ case PROCESSOR_GS464E:
case PROCESSOR_P5600:
return 4;
@@ -14877,10 +14886,10 @@ mips_multipass_dfa_lookahead (void)
if (TUNE_SB1)
return 4;
- if (TUNE_LOONGSON_2EF || TUNE_LOONGSON_3A)
+ if (TUNE_LOONGSON_2EF || TUNE_GS464 || TUNE_GS464E)
return 4;
- if (TUNE_OCTEON)
+ if (TUNE_OCTEON || TUNE_GS264E)
return 2;
if (TUNE_P5600 || TUNE_I6400)
@@ -15335,7 +15344,7 @@ AVAIL_NON_MIPS16 (dspr2, TARGET_DSPR2)
AVAIL_NON_MIPS16 (dsp_32, !TARGET_64BIT && TARGET_DSP)
AVAIL_NON_MIPS16 (dsp_64, TARGET_64BIT && TARGET_DSP)
AVAIL_NON_MIPS16 (dspr2_32, !TARGET_64BIT && TARGET_DSPR2)
-AVAIL_NON_MIPS16 (loongson, TARGET_LOONGSON_VECTORS)
+AVAIL_NON_MIPS16 (loongson, TARGET_LOONGSON_MMI)
AVAIL_NON_MIPS16 (cache, TARGET_CACHE_BUILTIN)
AVAIL_NON_MIPS16 (msa, TARGET_MSA)
@@ -20345,6 +20354,43 @@ mips_option_override (void)
TARGET_DSPR2 = false;
}
+ /* Make sure that when TARGET_LOONGSON_MMI is true, TARGET_HARD_FLOAT_ABI
+ is true. In o32 pairs of floating-point registers provide 64-bit
+ values. */
+ if (TARGET_LOONGSON_MMI && !TARGET_HARD_FLOAT_ABI)
+ error ("%<-mloongson-mmi%> must be used with %<-mhard-float%>");
+
+  /* Default to enable Loongson MMI on Loongson 2e, 2f, gs464, gs464e
+ * or gs264e target. */
+ if ((target_flags_explicit & MASK_LOONGSON_MMI) == 0
+ && ((strcmp (mips_arch_info->name, "loongson2e") == 0)
+ || (strcmp (mips_arch_info->name, "loongson2f") == 0)
+ || (strcmp (mips_arch_info->name, "loongson3a") == 0)
+ || (strcmp (mips_arch_info->name, "gs464") == 0)
+ || (strcmp (mips_arch_info->name, "gs464e") == 0)
+ || (strcmp (mips_arch_info->name, "gs264e") == 0)))
+ target_flags |= MASK_LOONGSON_MMI;
+
+  /* Default to enable Loongson EXT on Loongson gs464, gs464e
+ * or gs264e target. */
+ if ((target_flags_explicit & MASK_LOONGSON_EXT) == 0
+ && ((strcmp (mips_arch_info->name, "loongson3a") == 0)
+ || (strcmp (mips_arch_info->name, "gs464") == 0)
+ || (strcmp (mips_arch_info->name, "gs464e") == 0)
+ || (strcmp (mips_arch_info->name, "gs264e") == 0)))
+ target_flags |= MASK_LOONGSON_EXT;
+
+ /* Default to enable Loongson EXT2 on gs464e or gs264e target. */
+ if ((target_flags_explicit & MASK_LOONGSON_EXT2) == 0
+ && ((strcmp (mips_arch_info->name, "gs464e") == 0)
+ || (strcmp (mips_arch_info->name, "gs264e") == 0)))
+ target_flags |= MASK_LOONGSON_EXT2;
+
+ /* Default to enable MSA on gs264e target. */
+ if ((target_flags_explicit & MASK_MSA) == 0
+ && (strcmp (mips_arch_info->name, "gs264e") == 0))
+ target_flags |= MASK_MSA;
+
/* .eh_frame addresses should be the same width as a C pointer.
Most MIPS ABIs support only one pointer size, so the assembler
will usually know exactly how big an .eh_frame address is.
@@ -21330,12 +21376,12 @@ void mips_function_profiler (FILE *file)
/* Implement TARGET_SHIFT_TRUNCATION_MASK. We want to keep the default
behavior of TARGET_SHIFT_TRUNCATION_MASK for non-vector modes even
- when TARGET_LOONGSON_VECTORS is true. */
+ when TARGET_LOONGSON_MMI is true. */
static unsigned HOST_WIDE_INT
mips_shift_truncation_mask (machine_mode mode)
{
- if (TARGET_LOONGSON_VECTORS && VECTOR_MODE_P (mode))
+ if (TARGET_LOONGSON_MMI && VECTOR_MODE_P (mode))
return 0;
return GET_MODE_BITSIZE (mode) - 1;
@@ -21436,7 +21482,7 @@ mips_expand_vpc_loongson_even_odd (struct expand_vec_perm_d *d)
unsigned i, odd, nelt = d->nelt;
rtx t0, t1, t2, t3;
- if (!(TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS))
+ if (!(TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI))
return false;
/* Even-odd for V2SI/V2SFmode is matched by interleave directly. */
if (nelt < 4)
@@ -21493,7 +21539,7 @@ mips_expand_vpc_loongson_pshufh (struct expand_vec_perm_d *d)
unsigned i, mask;
rtx rmask;
- if (!(TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS))
+ if (!(TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI))
return false;
if (d->vmode != V4HImode)
return false;
@@ -21545,7 +21591,7 @@ mips_expand_vpc_loongson_bcast (struct expand_vec_perm_d *d)
unsigned i, elt;
rtx t0, t1;
- if (!(TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS))
+ if (!(TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI))
return false;
/* Note that we've already matched V2SI via punpck and V4HI via pshufh. */
if (d->vmode != V8QImode)
@@ -22139,7 +22185,7 @@ mips_expand_vector_init (rtx target, rtx vals)
}
/* Loongson is the only cpu with vectors with more elements. */
- gcc_assert (TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS);
+ gcc_assert (TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI);
/* If all values are identical, broadcast the value. */
if (all_same)
diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h
index 84ae675..971e73d 100644
--- a/gcc/config/mips/mips.h
+++ b/gcc/config/mips/mips.h
@@ -266,7 +266,9 @@ struct mips_cpu_info {
#define TARGET_LOONGSON_2E (mips_arch == PROCESSOR_LOONGSON_2E)
#define TARGET_LOONGSON_2F (mips_arch == PROCESSOR_LOONGSON_2F)
#define TARGET_LOONGSON_2EF (TARGET_LOONGSON_2E || TARGET_LOONGSON_2F)
-#define TARGET_LOONGSON_3A (mips_arch == PROCESSOR_LOONGSON_3A)
+#define TARGET_GS464 (mips_arch == PROCESSOR_GS464)
+#define TARGET_GS464E (mips_arch == PROCESSOR_GS464E)
+#define TARGET_GS264E (mips_arch == PROCESSOR_GS264E)
#define TARGET_MIPS3900 (mips_arch == PROCESSOR_R3900)
#define TARGET_MIPS4000 (mips_arch == PROCESSOR_R4000)
#define TARGET_MIPS4120 (mips_arch == PROCESSOR_R4120)
@@ -298,7 +300,9 @@ struct mips_cpu_info {
|| mips_tune == PROCESSOR_74KF3_2)
#define TUNE_LOONGSON_2EF (mips_tune == PROCESSOR_LOONGSON_2E \
|| mips_tune == PROCESSOR_LOONGSON_2F)
-#define TUNE_LOONGSON_3A (mips_tune == PROCESSOR_LOONGSON_3A)
+#define TUNE_GS464 (mips_tune == PROCESSOR_GS464)
+#define TUNE_GS464E (mips_tune == PROCESSOR_GS464E)
+#define TUNE_GS264E (mips_tune == PROCESSOR_GS264E)
#define TUNE_MIPS3000 (mips_tune == PROCESSOR_R3000)
#define TUNE_MIPS3900 (mips_tune == PROCESSOR_R3900)
#define TUNE_MIPS4000 (mips_tune == PROCESSOR_R4000)
@@ -318,13 +322,6 @@ struct mips_cpu_info {
#define TUNE_P5600 (mips_tune == PROCESSOR_P5600)
#define TUNE_I6400 (mips_tune == PROCESSOR_I6400)
-/* Whether vector modes and intrinsics for ST Microelectronics
- Loongson-2E/2F processors should be enabled. In o32 pairs of
- floating-point registers provide 64-bit values. */
-#define TARGET_LOONGSON_VECTORS (TARGET_HARD_FLOAT_ABI \
- && (TARGET_LOONGSON_2EF \
- || TARGET_LOONGSON_3A))
-
/* True if the pre-reload scheduler should try to create chains of
multiply-add or multiply-subtract instructions. For example,
suppose we have:
@@ -595,9 +592,12 @@ struct mips_cpu_info {
if (TARGET_ABICALLS) \
builtin_define ("__mips_abicalls"); \
\
- /* Whether Loongson vector modes are enabled. */ \
- if (TARGET_LOONGSON_VECTORS) \
- builtin_define ("__mips_loongson_vector_rev"); \
+ /* Whether Loongson vector modes are enabled. */ \
+ if (TARGET_LOONGSON_MMI) \
+ { \
+ builtin_define ("__mips_loongson_vector_rev"); \
+ builtin_define ("__mips_loongson_mmi"); \
+ } \
\
/* Historical Octeon macro. */ \
if (TARGET_OCTEON) \
@@ -779,7 +779,8 @@ struct mips_cpu_info {
%{march=mips32r6: -mips32r6} \
%{march=mips64|march=5k*|march=20k*|march=sb1*|march=sr71000 \
|march=xlr: -mips64} \
- %{march=mips64r2|march=loongson3a|march=octeon|march=xlp: -mips64r2} \
+ %{march=mips64r2|march=loongson3a|march=gs464|march=gs464e|march=gs264e \
+ |march=octeon|march=xlp: -mips64r2} \
%{march=mips64r3: -mips64r3} \
%{march=mips64r5: -mips64r5} \
%{march=mips64r6|march=i6400: -mips64r6}}"
@@ -935,7 +936,7 @@ struct mips_cpu_info {
/* ISA has 32 single-precision registers. */
#define ISA_HAS_ODD_SPREG ((mips_isa_rev >= 1 \
- && !TARGET_LOONGSON_3A) \
+ && !TARGET_GS464) \
|| TARGET_FLOAT64 \
|| TARGET_MIPS5900)
@@ -978,7 +979,7 @@ struct mips_cpu_info {
because the former are faster and can also have the effect of reducing
code size. */
#define ISA_AVOID_DIV_HILO ((TARGET_LOONGSON_2EF \
- || TARGET_LOONGSON_3A) \
+ || TARGET_GS464) \
&& !TARGET_MIPS16)
/* ISA supports instructions DDIV and DDIVU. */
@@ -1071,14 +1072,14 @@ struct mips_cpu_info {
'd = [+-] (a * b [+-] c)'. */
#define ISA_HAS_FUSED_MADD4 (mips_madd4 \
&& (TARGET_MIPS8000 \
- || TARGET_LOONGSON_3A))
+ || TARGET_GS464))
/* ISA has 4 operand unfused madd instructions of the form
'd = [+-] (a * b [+-] c)'. */
#define ISA_HAS_UNFUSED_MADD4 (mips_madd4 \
&& ISA_HAS_FP4 \
&& !TARGET_MIPS8000 \
- && !TARGET_LOONGSON_3A)
+ && !TARGET_GS464)
/* ISA has 3 operand r6 fused madd instructions of the form
'c = c [+-] (a * b)'. */
@@ -1114,6 +1115,9 @@ struct mips_cpu_info {
/* ISA has count leading zeroes/ones instruction (not implemented). */
#define ISA_HAS_CLZ_CLO (mips_isa_rev >= 1 && !TARGET_MIPS16)
+/* ISA has count trailing zeroes/ones instructions (Loongson EXT2).  */
+#define ISA_HAS_CTZ_CTO (TARGET_LOONGSON_EXT2)
+
/* ISA has three operand multiply instructions that put
the high part in an accumulator: mulhi or mulhiu. */
#define ISA_HAS_MULHI ((TARGET_MIPS5400 \
@@ -1355,6 +1359,7 @@ struct mips_cpu_info {
%{mvirt} %{mno-virt} \
%{mxpa} %{mno-xpa} \
%{mmsa} %{mno-msa} \
+%{mloongson-mmi} %{mno-loongson-mmi} \
%{msmartmips} %{mno-smartmips} \
%{mmt} %{mno-mt} \
%{mfix-rm7000} %{mno-fix-rm7000} \
@@ -2631,9 +2636,9 @@ typedef struct mips_args {
#define SLOW_BYTE_ACCESS (!TARGET_MIPS16)
/* Standard MIPS integer shifts truncate the shift amount to the
- width of the shifted operand. However, Loongson vector shifts
+ width of the shifted operand. However, Loongson MMI shifts
do not truncate the shift amount at all. */
-#define SHIFT_COUNT_TRUNCATED (!TARGET_LOONGSON_VECTORS)
+#define SHIFT_COUNT_TRUNCATED (!TARGET_LOONGSON_MMI)
/* Specify the machine mode that pointers have.
diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md
index 1d95348..c5e50a4 100644
--- a/gcc/config/mips/mips.md
+++ b/gcc/config/mips/mips.md
@@ -37,7 +37,9 @@
74kf3_2
loongson_2e
loongson_2f
- loongson_3a
+ gs464
+ gs464e
+ gs264e
m4k
octeon
octeon2
@@ -334,6 +336,7 @@
;; slt set less than instructions
;; signext sign extend instructions
;; clz the clz and clo instructions
+;; ctz the ctz and cto instructions
;; pop the pop instruction
;; trap trap if instructions
;; imul integer multiply 2 operands
@@ -374,7 +377,7 @@
(define_attr "type"
"unknown,branch,jump,call,load,fpload,fpidxload,store,fpstore,fpidxstore,
prefetch,prefetchx,condmove,mtc,mfc,mthi,mtlo,mfhi,mflo,const,arith,logical,
- shift,slt,signext,clz,pop,trap,imul,imul3,imul3nc,imadd,idiv,idiv3,move,
+ shift,slt,signext,clz,ctz,pop,trap,imul,imul3,imul3nc,imadd,idiv,idiv3,move,
fmove,fadd,fmul,fmadd,fdiv,frdiv,frdiv1,frdiv2,fabs,fneg,fcmp,fcvt,fsqrt,
frsqrt,frsqrt1,frsqrt2,dspmac,dspmacsat,accext,accmod,dspalu,dspalusat,
multi,atomic,syncloop,nop,ghost,multimem,
@@ -833,9 +836,9 @@
(define_mode_iterator MOVE64
[DI DF
(V2SF "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT")
- (V2SI "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS")
- (V4HI "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS")
- (V8QI "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS")])
+ (V2SI "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI")
+ (V4HI "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI")
+ (V8QI "TARGET_HARD_FLOAT && TARGET_LOONGSON_MMI")])
;; 128-bit modes for which we provide move patterns on 64-bit targets.
(define_mode_iterator MOVE128 [TI TF])
@@ -862,9 +865,9 @@
[(DF "!TARGET_64BIT && TARGET_DOUBLE_FLOAT")
(DI "!TARGET_64BIT && TARGET_DOUBLE_FLOAT")
(V2SF "!TARGET_64BIT && TARGET_PAIRED_SINGLE_FLOAT")
- (V2SI "!TARGET_64BIT && TARGET_LOONGSON_VECTORS")
- (V4HI "!TARGET_64BIT && TARGET_LOONGSON_VECTORS")
- (V8QI "!TARGET_64BIT && TARGET_LOONGSON_VECTORS")
+ (V2SI "!TARGET_64BIT && TARGET_LOONGSON_MMI")
+ (V4HI "!TARGET_64BIT && TARGET_LOONGSON_MMI")
+ (V8QI "!TARGET_64BIT && TARGET_LOONGSON_MMI")
(TF "TARGET_64BIT && TARGET_FLOAT64")])
;; In GPR templates, a string like "<d>subu" will expand to "subu" in the
@@ -1181,7 +1184,9 @@
(include "9000.md")
(include "10000.md")
(include "loongson2ef.md")
-(include "loongson3a.md")
+(include "gs464.md")
+(include "gs464e.md")
+(include "gs264e.md")
(include "octeon.md")
(include "sb1.md")
(include "sr71k.md")
@@ -1608,7 +1613,7 @@
{
rtx lo;
- if (TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A || ISA_HAS_R6<D>MUL)
+ if (TARGET_LOONGSON_2EF || TARGET_LOONGSON_EXT || ISA_HAS_R6<D>MUL)
emit_insn (gen_mul<mode>3_mul3_nohilo (operands[0], operands[1],
operands[2]));
else if (ISA_HAS_<D>MUL3)
@@ -1632,11 +1637,11 @@
(mult:GPR (match_operand:GPR 1 "register_operand" "d")
(match_operand:GPR 2 "register_operand" "d")))
(clobber (match_scratch:GPR 3 "=l"))]
- "TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A || ISA_HAS_R6<D>MUL"
+ "TARGET_LOONGSON_2EF || TARGET_LOONGSON_EXT || ISA_HAS_R6<D>MUL"
{
if (TARGET_LOONGSON_2EF)
return "<d>mult.g\t%0,%1,%2";
- else if (TARGET_LOONGSON_3A)
+ else if (TARGET_LOONGSON_EXT)
return "gs<d>mult\t%0,%1,%2";
else
return "<d>mul\t%0,%1,%2";
@@ -3026,11 +3031,11 @@
[(set (match_operand:GPR 0 "register_operand" "=&d")
(any_div:GPR (match_operand:GPR 1 "register_operand" "d")
(match_operand:GPR 2 "register_operand" "d")))]
- "TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A || ISA_HAS_R6<D>DIV"
+ "TARGET_LOONGSON_2EF || TARGET_LOONGSON_EXT || ISA_HAS_R6<D>DIV"
{
if (TARGET_LOONGSON_2EF)
return mips_output_division ("<d>div<u>.g\t%0,%1,%2", operands);
- else if (TARGET_LOONGSON_3A)
+ else if (TARGET_LOONGSON_EXT)
return mips_output_division ("gs<d>div<u>\t%0,%1,%2", operands);
else
return mips_output_division ("<d>div<u>\t%0,%1,%2", operands);
@@ -3042,11 +3047,11 @@
[(set (match_operand:GPR 0 "register_operand" "=&d")
(any_mod:GPR (match_operand:GPR 1 "register_operand" "d")
(match_operand:GPR 2 "register_operand" "d")))]
- "TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A || ISA_HAS_R6<D>DIV"
+ "TARGET_LOONGSON_2EF || TARGET_LOONGSON_EXT || ISA_HAS_R6<D>DIV"
{
if (TARGET_LOONGSON_2EF)
return "<d>mod<u>.g\t%0,%1,%2";
- else if (TARGET_LOONGSON_3A)
+ else if (TARGET_LOONGSON_EXT)
return "gs<d>mod<u>\t%0,%1,%2";
else
return mips_output_division ("<d>mod<u>\t%0,%1,%2", operands);
@@ -3159,6 +3164,23 @@
;;
;; ...................
;;
+;; Count trailing zeroes.
+;;
+;; ...................
+;;
+
+(define_insn "ctz<mode>2"
+ [(set (match_operand:GPR 0 "register_operand" "=d")
+ (ctz:GPR (match_operand:GPR 1 "register_operand" "d")))]
+ "ISA_HAS_CTZ_CTO"
+ "<d>ctz\t%0,%1"
+ [(set_attr "type" "ctz")
+ (set_attr "mode" "<MODE>")])
+
+
+;;
+;; ...................
+;;
;; Count number of set bits.
;;
;; ...................
@@ -4892,7 +4914,7 @@
(mem:GPR
(plus:P (match_operand:P 1 "register_operand" "d")
(match_operand:P 2 "register_operand" "d"))))]
- "TARGET_LOONGSON_3A && TARGET_64BIT"
+ "TARGET_LOONGSON_EXT && TARGET_64BIT"
"<GPR:gsloadx>\t%0,0(%1,%2)"
[(set_attr "type" "load")
(set_attr "mode" "<GPR:MODE>")])
@@ -4901,7 +4923,7 @@
[(set (mem:GPR (plus:P (match_operand:P 1 "register_operand" "d")
(match_operand:P 2 "register_operand" "d")))
(match_operand:GPR 0 "register_operand" "d"))]
- "TARGET_LOONGSON_3A && TARGET_64BIT"
+ "TARGET_LOONGSON_EXT && TARGET_64BIT"
"<GPR:gsstorex>\t%0,0(%1,%2)"
[(set_attr "type" "store")
(set_attr "mode" "<GPR:MODE>")])
@@ -4913,7 +4935,7 @@
(mem:SHORT
(plus:P (match_operand:P 1 "register_operand" "d")
(match_operand:P 2 "register_operand" "d")))))]
- "TARGET_LOONGSON_3A && TARGET_64BIT"
+ "TARGET_LOONGSON_EXT && TARGET_64BIT"
"<SHORT:gsloadx>\t%0,0(%1,%2)"
[(set_attr "type" "load")
(set_attr "mode" "<GPR:MODE>")])
@@ -4922,7 +4944,7 @@
[(set (mem:SHORT (plus:P (match_operand:P 1 "register_operand" "d")
(match_operand:P 2 "register_operand" "d")))
(match_operand:SHORT 0 "register_operand" "d"))]
- "TARGET_LOONGSON_3A && TARGET_64BIT"
+ "TARGET_LOONGSON_EXT && TARGET_64BIT"
"<SHORT:gsstorex>\t%0,0(%1,%2)"
[(set_attr "type" "store")
(set_attr "mode" "SI")])
@@ -5089,7 +5111,7 @@
(define_insn "movsf_zero"
[(set (match_operand:SF 0 "register_operand" "=f")
(match_operand:SF 1 "const_0_operand" ""))]
- "TARGET_LOONGSON_3A"
+ "TARGET_LOONGSON_EXT"
"xor %0,%0,%0"
[(set_attr "type" "logical")
(set_attr "mode" "SF")])
@@ -5150,7 +5172,7 @@
(define_insn "movdf_zero"
[(set (match_operand:DF 0 "register_operand" "=f")
(match_operand:DF 1 "const_0_operand" ""))]
- "TARGET_LOONGSON_3A"
+ "TARGET_LOONGSON_EXT"
"xor %0,%0,%0"
[(set_attr "type" "logical")
(set_attr "mode" "DF")])
@@ -7226,9 +7248,11 @@
(match_operand 2 "const_int_operand" "n"))]
"ISA_HAS_PREFETCH && TARGET_EXPLICIT_RELOCS"
{
- if (TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A)
+ if (TARGET_LOONGSON_2EF || TARGET_LOONGSON_EXT || TARGET_LOONGSON_EXT2)
{
- /* Loongson 2[ef] and Loongson 3a use load to $0 for prefetching. */
+      /* Loongson EXT2 implements the pref instruction.  */
+ if (TARGET_LOONGSON_EXT2)
+ return "pref\t%1, %a0";
if (TARGET_64BIT)
return "ld\t$0,%a0";
else
@@ -7780,8 +7804,8 @@
; microMIPS patterns.
(include "micromips.md")
-; ST-Microelectronics Loongson-2E/2F-specific patterns.
-(include "loongson.md")
+; Loongson MultiMedia extensions Instructions (MMI) patterns.
+(include "loongson-mmi.md")
; The MIPS MSA Instructions.
(include "mips-msa.md")
diff --git a/gcc/config/mips/mips.opt b/gcc/config/mips/mips.opt
index 4c0de02..92cdac4 100644
--- a/gcc/config/mips/mips.opt
+++ b/gcc/config/mips/mips.opt
@@ -300,7 +300,7 @@ Target Report Mask(MICROMIPS)
Use microMIPS instructions.
mmsa
-Target Report Var(TARGET_MSA)
+Target Report Mask(MSA)
Use MIPS MSA Extension instructions.
mmt
@@ -455,3 +455,15 @@ Enum(mips_cb_setting) String(optimal) Value(MIPS_CB_OPTIMAL)
EnumValue
Enum(mips_cb_setting) String(always) Value(MIPS_CB_ALWAYS)
+
+mloongson-mmi
+Target Report Mask(LOONGSON_MMI)
+Use Loongson MultiMedia extensions Instructions (MMI) instructions.
+
+mloongson-ext
+Target Report Mask(LOONGSON_EXT)
+Use Loongson EXTension (EXT) instructions.
+
+mloongson-ext2
+Target Report Mask(LOONGSON_EXT2)
+Use Loongson EXTension R2 (EXT2) instructions.
diff --git a/gcc/config/mips/t-st b/gcc/config/mips/t-st
index ec22d93..0791759 100644
--- a/gcc/config/mips/t-st
+++ b/gcc/config/mips/t-st
@@ -16,8 +16,8 @@
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.
-MULTILIB_OPTIONS = march=loongson3a/march=loongson2e/march=loongson2f mabi=n32/mabi=32/mabi=64
-MULTILIB_DIRNAMES = 3a 2e 2f lib32 lib lib64
+MULTILIB_OPTIONS = march=loongson3a/march=loongson2e/march=loongson2f/march=gs464/march=gs464e/march=gs264e mabi=n32/mabi=32/mabi=64
+MULTILIB_DIRNAMES = 3a 2e 2f gs464 gs464e gs264e lib32 lib lib64
MULTILIB_OSDIRNAMES = march.loongson2e/mabi.n32=../lib32/2e
MULTILIB_OSDIRNAMES += march.loongson2e/mabi.32=../lib/2e
@@ -28,6 +28,15 @@ MULTILIB_OSDIRNAMES += march.loongson2f/mabi.64=../lib64/2f
MULTILIB_OSDIRNAMES += march.loongson3a/mabi.n32=../lib32/3a
MULTILIB_OSDIRNAMES += march.loongson3a/mabi.32=../lib/3a
MULTILIB_OSDIRNAMES += march.loongson3a/mabi.64=../lib64/3a
+MULTILIB_OSDIRNAMES += march.gs464/mabi.n32=../lib32/gs464
+MULTILIB_OSDIRNAMES += march.gs464/mabi.32=../lib/gs464
+MULTILIB_OSDIRNAMES += march.gs464/mabi.64=../lib64/gs464
+MULTILIB_OSDIRNAMES += march.gs464e/mabi.n32=../lib32/gs464e
+MULTILIB_OSDIRNAMES += march.gs464e/mabi.32=../lib/gs464e
+MULTILIB_OSDIRNAMES += march.gs464e/mabi.64=../lib64/gs464e
+MULTILIB_OSDIRNAMES += march.gs264e/mabi.n32=../lib32/gs264e
+MULTILIB_OSDIRNAMES += march.gs264e/mabi.32=../lib/gs264e
+MULTILIB_OSDIRNAMES += march.gs264e/mabi.64=../lib64/gs264e
MULTILIB_OSDIRNAMES += mabi.n32=../lib32
MULTILIB_OSDIRNAMES += mabi.32=../lib
MULTILIB_OSDIRNAMES += mabi.64=../lib64
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 4441591..6b42028 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -20171,7 +20171,8 @@ The processor names are:
@samp{1004kc}, @samp{1004kf2_1}, @samp{1004kf1_1},
@samp{i6400},
@samp{interaptiv},
-@samp{loongson2e}, @samp{loongson2f}, @samp{loongson3a},
+@samp{loongson2e}, @samp{loongson2f}, @samp{gs464}, @samp{gs464e},
+@samp{gs264e},
@samp{m4k},
@samp{m14k}, @samp{m14kc}, @samp{m14ke}, @samp{m14kec},
@samp{m5100}, @samp{m5101},
diff --git a/gcc/testsuite/gcc.target/mips/loongson-ctz.c b/gcc/testsuite/gcc.target/mips/loongson-ctz.c
new file mode 100644
index 0000000..8df66a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/loongson-ctz.c
@@ -0,0 +1,11 @@
+/* Test cases for Loongson EXT2 instructions.  */
+
+/* { dg-do compile } */
+/* { dg-options "-mloongson-ext2" } */
+
+unsigned int foo(unsigned int x)
+{
+ return __builtin_ctz (x);
+}
+
+/* { dg-final { scan-assembler "ctz\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/loongson-dctz.c b/gcc/testsuite/gcc.target/mips/loongson-dctz.c
new file mode 100644
index 0000000..8c47433
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/loongson-dctz.c
@@ -0,0 +1,11 @@
+/* Test cases for Loongson EXT2 instructions.  */
+
+/* { dg-do compile } */
+/* { dg-options "-mloongson-ext2" } */
+
+unsigned long long foo(unsigned long long x)
+{
+ return __builtin_ctzl (x);
+}
+
+/* { dg-final { scan-assembler "dctz\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/loongson-shift-count-truncated-1.c b/gcc/testsuite/gcc.target/mips/loongson-shift-count-truncated-1.c
index baed48c..6e22c0e 100644
--- a/gcc/testsuite/gcc.target/mips/loongson-shift-count-truncated-1.c
+++ b/gcc/testsuite/gcc.target/mips/loongson-shift-count-truncated-1.c
@@ -4,11 +4,11 @@
/* loongson.h does not handle or check for MIPS16ness. There doesn't
seem any good reason for it to, given that the Loongson processors
do not support MIPS16. */
-/* { dg-options "isa=loongson -mhard-float -mno-mips16 (REQUIRES_STDLIB)" } */
+/* { dg-options "-mloongson-mmi -mhard-float -mno-mips16 (REQUIRES_STDLIB)" } */
/* See PR 52155. */
-/* { dg-options "isa=loongson -mhard-float -mno-mips16 -mlong64" { mips*-*-elf* && ilp32 } } */
+/* { dg-options "-mloongson-mmi -mhard-float -mno-mips16 -mlong64" { mips*-*-elf* && ilp32 } } */
-#include "loongson.h"
+#include "loongson-mmiintrin.h"
#include <assert.h>
typedef union { int32x2_t v; int32_t a[2]; } int32x2_encap_t;
diff --git a/gcc/testsuite/gcc.target/mips/loongson-simd.c b/gcc/testsuite/gcc.target/mips/loongson-simd.c
index f263b43..34fdcec 100644
--- a/gcc/testsuite/gcc.target/mips/loongson-simd.c
+++ b/gcc/testsuite/gcc.target/mips/loongson-simd.c
@@ -26,9 +26,9 @@ along with GCC; see the file COPYING3. If not see
because inclusion of some system headers e.g. stdint.h will fail due to not
finding stubs-o32_hard.h. */
/* { dg-require-effective-target mips_nanlegacy } */
-/* { dg-options "isa=loongson -mhard-float -mno-micromips -mno-mips16 -flax-vector-conversions (REQUIRES_STDLIB)" } */
+/* { dg-options "-mloongson-mmi -mhard-float -mno-micromips -mno-mips16 -flax-vector-conversions (REQUIRES_STDLIB)" } */
-#include "loongson.h"
+#include "loongson-mmiintrin.h"
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
diff --git a/gcc/testsuite/gcc.target/mips/mips.exp b/gcc/testsuite/gcc.target/mips/mips.exp
index 9db4fbe..5b2bf8b 100644
--- a/gcc/testsuite/gcc.target/mips/mips.exp
+++ b/gcc/testsuite/gcc.target/mips/mips.exp
@@ -296,6 +296,9 @@ foreach option {
mcount-ra-address
odd-spreg
msa
+ loongson-mmi
+ loongson-ext
+ loongson-ext2
} {
lappend mips_option_groups $option "-m(no-|)$option"
}
@@ -883,6 +886,12 @@ proc mips-dg-init {} {
"-mno-msa"
#endif
+ #ifdef __mips_loongson_mmi
+ "-mloongson-mmi"
+ #else
+ "-mno-loongson-mmi"
+ #endif
+
0
};
} 0]
diff --git a/gcc/testsuite/gcc.target/mips/umips-store16-1.c b/gcc/testsuite/gcc.target/mips/umips-store16-1.c
index 6377e85..f82c837 100644
--- a/gcc/testsuite/gcc.target/mips/umips-store16-1.c
+++ b/gcc/testsuite/gcc.target/mips/umips-store16-1.c
@@ -1,4 +1,4 @@
-/* { dg-options "(-mmicromips)" } */
+/* { dg-options "(-mmicromips) forbid_cpu=loongson3a" } */
/* { dg-do assemble } */
register unsigned int global asm ("$16");
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 50665df..326bd30 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1902,20 +1902,20 @@ proc check_mpaired_single_hw_available { } {
# Return 1 if the target supports executing Loongson vector instructions,
# 0 otherwise. Cache the result.
-proc check_mips_loongson_hw_available { } {
- return [check_cached_effective_target mips_loongson_hw_available {
+proc check_mips_loongson_mmi_hw_available { } {
+ return [check_cached_effective_target mips_loongson_mmi_hw_available {
# If this is not the right target then we can skip the test.
if { !([istarget mips*-*-*]) } {
expr 0
} else {
- check_runtime_nocache mips_loongson_hw_available {
- #include <loongson.h>
+ check_runtime_nocache mips_loongson_mmi_hw_available {
+ #include <loongson-mmiintrin.h>
int main()
{
asm volatile ("paddw $f2,$f4,$f6");
return 0;
}
- } ""
+ } "-mloongson-mmi"
}
}]
}
@@ -1969,9 +1969,9 @@ proc check_effective_target_mpaired_single_runtime { } {
# Return 1 if the target supports running Loongson executables, 0 otherwise.
-proc check_effective_target_mips_loongson_runtime { } {
- if { [check_effective_target_mips_loongson]
- && [check_mips_loongson_hw_available] } {
+proc check_effective_target_mips_loongson_mmi_runtime { } {
+ if { [check_effective_target_mips_loongson_mmi]
+ && [check_mips_loongson_mmi_hw_available] } {
return 1
}
return 0
@@ -3070,7 +3070,7 @@ proc check_effective_target_vect_int { } {
|| [istarget aarch64*-*-*]
|| [is-effective-target arm_neon]
|| ([istarget mips*-*-*]
- && ([et-is-effective-target mips_loongson]
+ && ([et-is-effective-target mips_loongson_mmi]
|| [et-is-effective-target mips_msa]))
|| ([istarget s390*-*-*]
&& [check_effective_target_s390_vx]) } {
@@ -4808,11 +4808,24 @@ proc add_options_for_mips_msa { flags } {
return "$flags -mmsa"
}
+# Add the options needed for the MIPS Loongson MMI Architecture.
+
+proc add_options_for_mips_loongson_mmi { flags } {
+ if { ! [check_effective_target_mips_loongson_mmi] } {
+ return "$flags"
+ }
+ return "$flags -mloongson-mmi"
+}
+
+
# Return 1 if this a Loongson-2E or -2F target using an ABI that supports
# the Loongson vector modes.
-proc check_effective_target_mips_loongson { } {
+proc check_effective_target_mips_loongson_mmi { } {
return [check_no_compiler_messages loongson assembly {
+ #if !defined(__mips_loongson_mmi)
+ #error !__mips_loongson_mmi
+ #endif
#if !defined(__mips_loongson_vector_rev)
#error !__mips_loongson_vector_rev
#endif
@@ -5387,7 +5400,7 @@ proc check_effective_target_vect_shift { } {
|| [is-effective-target arm_neon]
|| ([istarget mips*-*-*]
&& ([et-is-effective-target mips_msa]
- || [et-is-effective-target mips_loongson]))
+ || [et-is-effective-target mips_loongson_mmi]))
|| ([istarget s390*-*-*]
&& [check_effective_target_s390_vx]) } {
set et_vect_shift_saved($et_index) 1
@@ -5407,7 +5420,7 @@ proc check_effective_target_whole_vector_shift { } {
|| ([is-effective-target arm_neon]
&& [check_effective_target_arm_little_endian])
|| ([istarget mips*-*-*]
- && [et-is-effective-target mips_loongson])
+ && [et-is-effective-target mips_loongson_mmi])
|| ([istarget s390*-*-*]
&& [check_effective_target_s390_vx]) } {
set answer 1
@@ -5613,7 +5626,7 @@ proc check_effective_target_vect_no_int_min_max { } {
|| [istarget spu-*-*]
|| [istarget alpha*-*-*]
|| ([istarget mips*-*-*]
- && [et-is-effective-target mips_loongson]) } {
+ && [et-is-effective-target mips_loongson_mmi]) } {
set et_vect_no_int_min_max_saved($et_index) 1
}
}
@@ -6384,7 +6397,7 @@ proc check_effective_target_vect_no_align { } {
|| [check_effective_target_arm_vect_no_misalign]
|| ([istarget powerpc*-*-*] && [check_p8vector_hw_available])
|| ([istarget mips*-*-*]
- && [et-is-effective-target mips_loongson]) } {
+ && [et-is-effective-target mips_loongson_mmi]) } {
set et_vect_no_align_saved($et_index) 1
}
}
@@ -6714,7 +6727,7 @@ proc check_effective_target_vect_short_mult { } {
|| [check_effective_target_arm32]
|| ([istarget mips*-*-*]
&& ([et-is-effective-target mips_msa]
- || [et-is-effective-target mips_loongson]))
+ || [et-is-effective-target mips_loongson_mmi]))
|| ([istarget s390*-*-*]
&& [check_effective_target_s390_vx]) } {
set et_vect_short_mult_saved($et_index) 1
@@ -8529,8 +8542,8 @@ proc check_vect_support_and_set_flags { } {
if { [check_effective_target_mpaired_single] } {
lappend EFFECTIVE_TARGETS mpaired_single
}
- if { [check_effective_target_mips_loongson] } {
- lappend EFFECTIVE_TARGETS mips_loongson
+ if { [check_effective_target_mips_loongson_mmi] } {
+ lappend EFFECTIVE_TARGETS mips_loongson_mmi
}
if { [check_effective_target_mips_msa] } {
lappend EFFECTIVE_TARGETS mips_msa
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment